crystal-technologies committed
Commit de4ade4 · 1 Parent(s): 82c3d93

Upload 303 files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. Perceptrix/__init__.py +2 -0
  2. Perceptrix/chat.py +125 -0
  3. Perceptrix/create_data/interface.py +152 -0
  4. Perceptrix/create_data/static/style.css +154 -0
  5. Perceptrix/create_data/templates/index.html +80 -0
  6. Perceptrix/engine.py +213 -0
  7. Perceptrix/finetune/Dockerfile +13 -0
  8. Perceptrix/finetune/Makefile +23 -0
  9. Perceptrix/finetune/README.md +265 -0
  10. Perceptrix/finetune/build/lib/inference/__init__.py +4 -0
  11. Perceptrix/finetune/build/lib/inference/convert_composer_mpt_to_ft.py +232 -0
  12. Perceptrix/finetune/build/lib/inference/convert_composer_to_hf.py +290 -0
  13. Perceptrix/finetune/build/lib/inference/convert_hf_mpt_to_ft.py +154 -0
  14. Perceptrix/finetune/build/lib/inference/convert_hf_to_onnx.py +229 -0
  15. Perceptrix/finetune/build/lib/inference/hf_chat.py +389 -0
  16. Perceptrix/finetune/build/lib/inference/hf_generate.py +372 -0
  17. Perceptrix/finetune/build/lib/inference/run_mpt_with_ft.py +480 -0
  18. Perceptrix/finetune/build/lib/llmfoundry/__init__.py +71 -0
  19. Perceptrix/finetune/build/lib/llmfoundry/callbacks/__init__.py +31 -0
  20. Perceptrix/finetune/build/lib/llmfoundry/callbacks/eval_gauntlet_callback.py +177 -0
  21. Perceptrix/finetune/build/lib/llmfoundry/callbacks/fdiff_callback.py +67 -0
  22. Perceptrix/finetune/build/lib/llmfoundry/callbacks/generate_callback.py +30 -0
  23. Perceptrix/finetune/build/lib/llmfoundry/callbacks/hf_checkpointer.py +167 -0
  24. Perceptrix/finetune/build/lib/llmfoundry/callbacks/model_gauntlet_callback.py +21 -0
  25. Perceptrix/finetune/build/lib/llmfoundry/callbacks/monolithic_ckpt_callback.py +115 -0
  26. Perceptrix/finetune/build/lib/llmfoundry/callbacks/resumption_callbacks.py +89 -0
  27. Perceptrix/finetune/build/lib/llmfoundry/callbacks/scheduled_gc_callback.py +75 -0
  28. Perceptrix/finetune/build/lib/llmfoundry/data/__init__.py +21 -0
  29. Perceptrix/finetune/build/lib/llmfoundry/data/data.py +117 -0
  30. Perceptrix/finetune/build/lib/llmfoundry/data/denoising.py +937 -0
  31. Perceptrix/finetune/build/lib/llmfoundry/data/finetuning/__init__.py +7 -0
  32. Perceptrix/finetune/build/lib/llmfoundry/data/finetuning/collator.py +343 -0
  33. Perceptrix/finetune/build/lib/llmfoundry/data/finetuning/dataloader.py +516 -0
  34. Perceptrix/finetune/build/lib/llmfoundry/data/finetuning/tasks.py +433 -0
  35. Perceptrix/finetune/build/lib/llmfoundry/data/packing.py +423 -0
  36. Perceptrix/finetune/build/lib/llmfoundry/data/text_data.py +367 -0
  37. Perceptrix/finetune/build/lib/llmfoundry/models/__init__.py +18 -0
  38. Perceptrix/finetune/build/lib/llmfoundry/models/hf/__init__.py +18 -0
  39. Perceptrix/finetune/build/lib/llmfoundry/models/hf/hf_causal_lm.py +227 -0
  40. Perceptrix/finetune/build/lib/llmfoundry/models/hf/hf_fsdp.py +257 -0
  41. Perceptrix/finetune/build/lib/llmfoundry/models/hf/hf_prefix_lm.py +150 -0
  42. Perceptrix/finetune/build/lib/llmfoundry/models/hf/hf_t5.py +134 -0
  43. Perceptrix/finetune/build/lib/llmfoundry/models/hf/model_wrapper.py +108 -0
  44. Perceptrix/finetune/build/lib/llmfoundry/models/inference_api_wrapper/__init__.py +13 -0
  45. Perceptrix/finetune/build/lib/llmfoundry/models/inference_api_wrapper/interface.py +110 -0
  46. Perceptrix/finetune/build/lib/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py +243 -0
  47. Perceptrix/finetune/build/lib/llmfoundry/models/layers/__init__.py +32 -0
  48. Perceptrix/finetune/build/lib/llmfoundry/models/layers/attention.py +768 -0
  49. Perceptrix/finetune/build/lib/llmfoundry/models/layers/blocks.py +117 -0
  50. Perceptrix/finetune/build/lib/llmfoundry/models/layers/custom_embedding.py +14 -0
Perceptrix/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # from Perceptrix.engine import robotix, identify_objects_from_text, search_keyword
+ # from Perceptrix.chat import perceptrix
Perceptrix/chat.py ADDED
@@ -0,0 +1,125 @@
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig, GenerationConfig
+ from Perceptrix.streamer import TextStreamer
+ from utils import setup_device
+ import torch
+ import time
+ import os
+
+ model_name = os.environ.get('CHAT_MODEL')
+
+ model_path = "models/CRYSTAL-chat" if model_name is None else model_name
+ config = AutoConfig.from_pretrained(
+     model_path, trust_remote_code=True)
+
+ device = setup_device()
+ # device = "mps"  # uncomment to force Apple Silicon instead of the detected device
+
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     model_path,
+     config=config,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True,
+     device_map="auto",
+     torch_dtype=torch.float16,
+     # quantization_config=bnb_config,
+     offload_folder="offloads",
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     model_path,
+     trust_remote_code=True,
+ )
+
+ if tokenizer.pad_token_id is None:
+     tokenizer.pad_token = tokenizer.eos_token
+
+ tokenizer.padding_side = "left"
+ model.eval()
+
+ streamer = TextStreamer(tokenizer, skip_prompt=True,
+                         skip_special_tokens=True, save_file="reply.txt")
+
+ def evaluate(
+     prompt='',
+     temperature=0.4,
+     top_p=0.65,
+     top_k=35,
+     repetition_penalty=1.1,
+     max_new_tokens=512,
+     **kwargs,
+ ):
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"].to(device)
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         repetition_penalty=repetition_penalty,
+         **kwargs,
+     )
+
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=input_ids,
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+             eos_token_id=tokenizer.eos_token_id,
+             pad_token_id=tokenizer.pad_token_id,
+             streamer=streamer,
+         )
+     s = generation_output.sequences[0]
+     output = tokenizer.decode(s, skip_special_tokens=True)
+     yield output.split("### Response:")[-1].strip()
+
+
+ def predict(
+     inputs,
+     temperature=0.4,
+     top_p=0.65,
+     top_k=35,
+     repetition_penalty=1.1,
+     max_new_tokens=512,
+ ):
+     now_prompt = inputs
+
+     response = evaluate(
+         now_prompt, temperature, top_p, top_k, repetition_penalty, max_new_tokens, do_sample=True
+     )
+
+     for i in response:
+         print(i)
+         response = i
+
+     return response
+
+
+ instructions = "You are Comprehensive Robotics Yielding Sophisticated Technology And Logistics (CRYSTAL), an AI robot developed by Vatsal Dutt to be the most advanced robot in the world. You will be provided with prompts and other information to help the user."
+
+ def perceptrix(prompt):
+     prompt = instructions+"\n"+prompt
+     response = predict(
+         inputs=prompt, temperature=0.2, top_p=0.9, max_new_tokens=512
+     )
+     spl_tokens = ["<|im_start|>", "<|im_end|>"]
+     clean_prompt = prompt.replace(spl_tokens[0], "").replace(spl_tokens[1], "")
+     return response[len(clean_prompt):]
+
+
+ if __name__ == "__main__":
+     history = ""
+     while True:
+         user_input = input("User: ")
+         start = time.time()
+         user_input = "<|im_start|>User\n"+user_input+"<|im_end|>\n<|im_start|>CRYSTAL\n"
+         result = perceptrix(history+user_input)
+         history += user_input + result + "<|im_end|>\n"
+         print("Answer completed in ~", round(time.time()-start), "seconds")
Perceptrix/create_data/interface.py ADDED
@@ -0,0 +1,152 @@
+ from flask import Flask, render_template, request
+ import random
+ import json
+
+ app = Flask(__name__)
+
+ data_file_path = "Perceptrix/finetune/finetune-data/crystal-finetune.json"
+
+ with open(data_file_path, 'r') as data_file:
+     file_data = data_file.read()
+     data = json.loads(file_data)
+
13
+
14
+ room_descriptions = ["The surroundings include a bed, a chair, and a dog. The bed is made up with a white blanket, and there are two people sitting on it, likely a man and a woman. The chair is positioned next to the bed, and the dog is sitting on the bed as well. The room appears to be a bedroom, and the atmosphere seems to be cozy and comfortable.",
15
+ "In the image, there is a living room with a couch, a chair, a coffee table, and a window. The room is well-decorated and filled with furniture, including a bed, a desk, and a dining table. The living room is situated next to the bedroom, and there is a window in the living room. The overall atmosphere of the room is cozy and inviting.",
16
+ "The surroundings include a living room with a couch, a chair, and a coffee table. There is also a television in the room.",
17
+ "In the image, there is a group of people sitting in chairs, likely in a classroom or a meeting room. They are engaged in a discussion or a presentation, with some of them looking at a screen.",
18
+ "The surroundings include a living room with a couch, a chair, and a window. The room is well-lit, and there are several potted plants in the space.",
19
+ "In the image, a woman is lying on a bed surrounded by a variety of stuffed animals. There are at least ten stuffed animals of different colors and sizes, including teddy bears, dolls, and other toys. The scene appears to be cozy and comfortable, with the woman resting peacefully in her bed.",
20
+ "The surroundings include a living room with a yellow couch, a table, and a potted plant. There are three people sitting on the couch, and a laptop is placed on the table.",
21
+ "The surroundings include a living room with a couch, a coffee table, and a TV. The room is filled with people, and they are sitting on the couch, engaging in a conversation.",
22
+ "The surroundings include a living room with a couch, a chair, and a window. There is also a woman standing in the room, possibly near the window. The room appears to be clean and well-maintained.",
23
+ "The surroundings include a living room with a bed, a couch, a chair, and a TV. There are also various items scattered around, such as a book, a bottle, and a cup. The room appears to be messy and disorganized, with some items like the book and bottle being placed on the floor.",
24
+ "In the image, there is a bedroom with a bed, a chair, and a window. A woman is sitting on the bed, and a dog is nearby. The woman is wearing a white shirt and appears to be engaged in a conversation with the AI assistant. The room appears to be clean and well-organized.",
25
+ "I am in a living room, sitting on a couch, and using a laptop.",
26
+ "The surroundings include a couch, a chair, a window, and a potted plant. There is also a person sitting on the couch, and a baby is laying on the person's lap.",
27
+ "In the image, there is a group of people standing in a living room, with a bed and a couch visible in the background. The room appears to be clean and well-organized.",
28
+ "In the image, there is a living room with a white couch, a chair, and a window. The room is well-lit and appears to be clean and organized.",
29
+ "The surroundings consist of a large, empty room with a hardwood floor. There is a man sitting on the floor, possibly in a corner or a cubicle, and he is holding a remote control.",
30
+ "The surroundings include a living room with a woman standing in front of a door, which is open. The room appears to be dimly lit, creating a somewhat dark atmosphere.",
31
+ "The surroundings include a living room with a couch, a chair, and a TV. There are three people sitting on the couch, and a baby is present. The living room appears to be a comfortable and cozy space for the family to spend time together.",
32
+ "In the image, there is a person sitting on a bed in a bedroom. The bed is surrounded by a colorful blanket, and there is a laptop on the bed. The room appears to be a small bedroom, and the bed is positioned near a window.",
33
+ "The surroundings include a bedroom with a bed, a nightstand, and a window. The bed is neatly made and has a white and gray color scheme. There are also potted plants in the room, adding a touch of greenery and a sense of freshness.",
34
+ "In the image, there is a large bedroom with a bed, a nightstand, and a window. The room is clean and well-organized, with a white color scheme and a minimalist design. The bed is neatly made, and there are pillows on it. The room also has a chair and a potted plant, adding a touch of warmth and natural elements to the space. The window provides natural light, and the room appears to be well-lit and inviting.",
35
+ "In the image, there is a living room with a couch, a chair, and a coffee table. The room is well-decorated and features a dark color scheme, with a black couch and a black coffee table. There is also a potted plant in the room, adding a touch of greenery to the space. The living room is well-lit, and there are several books scattered around the room, suggesting that the occupants enjoy reading.",
36
+ "In the image, there is a living room with a large window, a couch, and a chair. The room is filled with furniture, including a coffee table, a dining table, and a potted plant. The living room has a modern and clean design, with a white color scheme. The large window allows for natural light to enter the room, creating a bright and inviting atmosphere.",
37
+ "In the image, there is a bedroom with a large bed, a nightstand, and a window. The room is well-lit and clean, creating a comfortable and inviting atmosphere.",
38
+ "The surroundings include a living room with a couch, a coffee table, and a television. The room is filled with various items, such as books, a vase, and a potted plant. The living room is well-lit, and there is a window in the room. Additionally, there is a dining table and chairs, which suggests that the living room and dining area are combined.",
39
+ "In the image, I am surrounded by a living room with a piano, a couch, and a chair. The living room has a modern design, and the furniture is arranged in a way that creates a comfortable and inviting atmosphere.",
40
+ "The surroundings include a living room with a couch, a coffee table, and a vase. The living room is well-decorated and has a clean and organized appearance.",
41
+ "The surroundings include a large bedroom with a white color scheme, a bed with a white comforter, and a window. There is also a ceiling fan, which is a white fan, and a chair in the room. The room appears to be clean and well-maintained.",
42
+ "In the image, there is a green couch, a green chair, and a green ottoman in a living room. The room is filled with books, suggesting that it is a cozy and well-read space.",
43
+ "The surroundings include a large bedroom with a large bed, a chair, and a desk. The room also has a window, a lamp, and a mirror. The bed is neatly made, and there are pillows on it. The room is well-lit, with a lamp providing illumination.",
44
+ "In the image, there is a living room with a couch, a chair, and a table. The room has a modern design, featuring a large window and a chandelier. The living room is filled with furniture, including a couch, a chair, and a table. The room also has a potted plant, which adds a touch of greenery and a sense of freshness to the space. The living room is well-lit, with the large window providing ample natural light, and the chandelier adding a touch of elegance and sophistication.",
45
+ "I am in a living room with a fireplace, a couch, a chair, a dining table, and a potted plant. The room is filled with furniture and decorations, creating a cozy and inviting atmosphere.",
46
+ "In the image, there is a living room with a couch, two chairs, and a coffee table. The living room is well-lit and has a view of the city, which adds to the ambiance of the space. The room also features a potted plant and a vase, adding a touch of greenery and decoration to the area.",
47
+ "In the image, there is a neatly made bed in a bedroom, with a white comforter and a red blanket. The bed is situated next to a window, which allows natural light to enter the room. The room also has a nightstand with a lamp, providing additional lighting. The overall atmosphere of the room is clean and inviting.",
48
+ "In the image, I am surrounded by a large, clean living room with white walls, a fireplace, and a comfortable couch. There are also several chairs and a dining table in the room. The space is well-lit, and the furniture is arranged to create a cozy and inviting atmosphere.",
49
+ "The surroundings in the image include a living room with a couch, chairs, and a coffee table. The living room is filled with furniture, and there are multiple lamps and potted plants scattered throughout the space. The room also has a window, which allows natural light to enter the room.",
50
+ "The surroundings include a living room with a couch, a coffee table, and a lamp. The room also has a large window, which allows for natural light to enter. There are several chairs and a dining table in the room, suggesting that it is a multi-purpose space for relaxation and dining. The living room is well-lit and furnished with comfortable seating options, creating a welcoming atmosphere for people to gather and socialize.",
51
+ "In the image, I am surrounded by a living room filled with furniture, including a couch, chairs, and a coffee table. The living room is well-decorated, and there are several books and a vase present. The room also features a rug, which adds to the overall aesthetic and comfort of the space.",
52
+ "The surroundings include a messy room with a bed, a desk, and a chair. The room is filled with clothes, shoes, and other items, creating a cluttered and disorganized space.",
53
+ "The surroundings in the image include a cluttered room with a desk, a bed, and various items scattered around. The room appears to be messy and disorganized, with clothes and other belongings scattered on the floor.",
54
+ "The surroundings include a group of people sitting on a bed, with a laptop and a cell phone visible. The room appears to be a bedroom, and the individuals are engaged in a conversation.",
55
+ "I am in a living room, surrounded by several people sitting on a couch. They are all engaged in various activities, such as watching TV, using their cell phones, and possibly playing video games. The room is filled with furniture, including a couch, chairs, and a TV. The atmosphere appears to be casual and relaxed, with the people enjoying their time together in the living room.",
56
+ "The surroundings include a group of people sitting on a couch, with a wooden table in the background. The room appears to be a living room, and there is a window nearby.",
57
+ "In the image, there are several people sitting on a couch, using their cell phones. The couch is located in a living room, and the people are engaged in various activities on their devices.",
58
+ "The surroundings include a living room with a fireplace, where a group of people is sitting on couches and chairs. There are multiple books scattered around the room, suggesting that the individuals might be engaged in reading or studying. The room also has a dining table and a potted plant, which adds to the cozy atmosphere of the space.",
59
+ "The surroundings include a living room with a couch, a dining table, and a pizza on the table. The people in the room are sitting and enjoying their meal together.",
60
+ "In the image, there are four people sitting on a bed, with two of them facing the camera. They are all wearing blue shirts and are engaged in a conversation. The scene takes place in a bedroom, which is a comfortable and familiar setting for the group.",
61
+ "The surroundings include a group of people sitting on a couch, with some of them holding pizza boxes. The room appears to be a living room, and there is a book nearby.",
62
+ "In the image, there are four people sitting on a couch in a living room. They are all engaged in using their cell phones, with one of them holding a book. The room has a wooden floor and a table, and there are chairs nearby. The atmosphere appears to be casual and relaxed, with the group of friends enjoying their time together while using their devices.",
63
+ "I am in a living room, which is filled with furniture such as a couch, a chair, and a table. The room is well-lit and appears to be a comfortable space for relaxation and socializing.",
64
+ "The surroundings include a living room with a couch, a table, and a chair. There are people sitting on the couch and a man standing in the room.",
65
+ "The surroundings include a group of people sitting on a couch, likely in a living room or a similar space. They are engaged in a conversation or enjoying each other's company.",
66
+ "The surroundings include a living room with a couch, a chair, and a table. There are several people sitting on the couch and chairs, engaging in conversation and enjoying each other's company. The room appears to be well-lit, creating a comfortable atmosphere for socializing.",
67
+ "In the image, there is a group of people sitting around a dining table in a room. The table is covered with various items, including cups, bowls, and a vase. The room appears to be a living room or a dining area, with a couch and chairs nearby. The scene suggests a casual and comfortable setting where people are gathered for a meal or a social event.",
68
+ "The surroundings include a living room with a couch, a dining table, and a TV. The room is filled with people, some of whom are sitting on the couch, while others are standing around the table. There are also several bottles and cups, which might be used for drinking. The atmosphere appears to be relaxed and social, with people enjoying each other's company and engaging in conversations.",
69
+ "The surroundings include a living room with a couch, a coffee table, and a window. There are three people sitting on the couch, engaging in a conversation.",
70
+ "The surroundings include a living room with a couch, a table, and a couple of chairs. There are also several bottles and cups on the table, suggesting that the room is set up for a casual gathering or a social event.",
71
+ "The surroundings include a living room with a couch, chairs, and a coffee table. There are also books scattered around the room, suggesting that the group of people might be engaged in a discussion or reading. The room appears to be cozy and comfortable, with a relaxed atmosphere.",
72
+ "The surroundings include a living room with a couch, a television, and a group of people sitting together."]
73
+
74
+ cities = ["Wake Forest, NC", "Rocky Mount, NC", "San Francisco, CA", "New York City, NY", "Trenton, NJ", "Philadelphia, PA",
75
+ "Vikas Puri, New Delhi", "Jiugong, Beijing", "Les Halles, Paris", "Diemen, Amsterdam", "Al Shamkhah, Abu Dhabi",
76
+ "Cairo", "Idore, Madhya Pradesh", "Bangalore, Karnataka", "Toronoto, Ontario", "Brixton, London", "Charlotte, NC",
77
+ "Los Angeles, CA", "Las Vegas, NV", "Cupertino, CA", "Silicon Valley, CA", "Sham Shui Po, Hong Kong", "Danilovsky District, Moscow",
78
+ "Rochester, NY", "Manhattan, NY"]
79
+ weather_names = ["Sunny", "Rainy", "Windy", "Cloudy",
80
+ "Mostly Cloudy", "Partly Cloudy", "Light Rain", "Sleet"]
81
+ days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
82
+ months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
83
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
84
+
85
+ dummy_time = f"{random.choice(days)} {random.randint(1, 30)} {random.choice(months)} {random.randint(2020, 2024)} 0{random.randint(1, 9)}:{random.randint(0, 59)} {random.choice(['AM', 'PM'])}"
86
+ dummy_weather = f"{random.choice(cities)} is {random.choice(weather_names)} with {random.randint(60, 95)}°F and Precipitation: {random.randint(0, 100)}%, Humidity: {random.randint(0, 100)}%, Wind: {random.randint(1, 15)} mph"
87
+ dummy_current_events = random.choice(room_descriptions)
88
+
89
+
+ @app.route("/")
+ def home():
+     vqa = file_data.count("> VQA")
+     robot = file_data.count("> Robot")
+     internet = file_data.count("> Internet")
+     cli = file_data.count("> CLI")
+     note = file_data.count("> Note")
+     home_automation = file_data.count("> Home Automation")
+     header = f"""VQA: {vqa}
+ ROBOT: {robot}
+ INTERNET: {internet}
+ CLI: {cli}
+ NOTE: {note}
+ HOME AUTOMATION: {home_automation}
+
+ """
+
+     return render_template("index.html", full_data=header + json.dumps(data, indent=4), data_entries=len(data), dummy_time=dummy_time, dummy_weather=dummy_weather, dummy_current_events=dummy_current_events[:-1])
+
+
+ @app.route('/record', methods=['GET', 'POST'])
+ def record():
+     if request.method == "POST":
+         with open(data_file_path, 'r') as data_file:
+             data = json.loads(data_file.read())
+         dummy_time = f"{random.choice(days)} {random.randint(1, 30)} {random.choice(months)} {random.randint(2020, 2024)} 0{random.randint(1, 9)}:{random.randint(0, 59)} {random.choice(['AM', 'PM'])}"
+         dummy_weather = f"{random.choice(cities)} is {random.choice(weather_names)} with {random.randint(60, 95)}°F and Precipitation: {random.randint(0, 100)}%, Humidity: {random.randint(0, 100)}%, Wind: {random.randint(1, 15)} mph"
+         dummy_current_events = random.choice(room_descriptions)
+
+         entry = request.form["current-data-preview"]
+         input_field = request.form["input"]
+         entry = {
+             "prompt": entry.split(entry.split(input_field)[-1])[0],
+             "response": entry.split(input_field)[-1][2:],
+         }
+         data.append(entry)
+         with open(data_file_path, 'w+') as file:
+             file.write(str(json.dumps(data, indent=4)).replace("\r\n", "\n"))
+
+         with open(data_file_path, "r") as data_file:
+             file_data = data_file.read()
+
+         vqa = file_data.count("> VQA")
+         robot = file_data.count("> Robot")
+         internet = file_data.count("> Internet")
+         cli = file_data.count("> CLI")
+         note = file_data.count("> Note")
+         home_automation = file_data.count("> Home Automation")
+
+         header = f"""VQA: {vqa}
+ ROBOT: {robot}
+ INTERNET: {internet}
+ CLI: {cli}
+ NOTE: {note}
+ HOME AUTOMATION: {home_automation}
+
+ """
+
+         return render_template("index.html", full_data=header + json.dumps(data, indent=4), data_entries=len(data), dummy_time=dummy_time, dummy_weather=dummy_weather, dummy_current_events=dummy_current_events[:-1])
+
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0")
Perceptrix/create_data/static/style.css ADDED
@@ -0,0 +1,154 @@
1
+ @import url('https://fonts.cdnfonts.com/css/bitsumishi');
2
+
3
+ h1 {
4
+ font-family: "Bitsumishi", sans-serif;
5
+ font-size: 50px;
6
+ color: rgb(255, 255, 255);
7
+ margin: 10px;
8
+ letter-spacing: 3px;
9
+ }
10
+
11
+ body {
12
+ text-align: center;
13
+ background-color: rgb(0, 0, 0);
14
+ display: flex;
15
+ flex-direction: column;
16
+ justify-content: center;
17
+ margin: auto;
18
+ margin-top: 20px;
19
+ overflow: auto;
20
+ font-family: "Bitsumishi", sans-serif;
21
+ }
22
+
23
+ .entries {
24
+ display: flex;
25
+ flex-direction: column;
26
+ align-items: center;
27
+ justify-content: space-evenly;
28
+ height: 90vh;
29
+ width: fit-content;
30
+ }
31
+
32
+ .entry {
33
+ width: 20vw;
34
+ height: 25vh;
35
+ border: 2px solid rgb(32, 161, 236);
36
+ font-family: "Bitsumishi", sans-serif;
37
+ background-color: transparent;
38
+ border-top-left-radius: 30px;
39
+ border-bottom-right-radius: 30px;
40
+ padding: 25px;
41
+ resize: none;
42
+ outline: none;
43
+ color: white;
44
+ font-size: medium;
45
+ letter-spacing: 1px;
46
+ }
47
+
48
+ .main {
49
+ display: flex;
50
+ width: 90vw;
51
+ justify-content: space-between;
52
+ align-items: center;
53
+ margin: auto;
54
+ }
55
+
56
+ #submit {
57
+ background-color: transparent;
58
+ font-family: "Bitsumishi", sans-serif;
59
+ height: fit-content;
60
+ padding: 10px;
61
+ border: 2px solid rgb(32, 161, 236);
62
+ color: rgb(32, 161, 236);
63
+ font-size: medium;
64
+ cursor: pointer;
65
+ border-top-left-radius: 10px;
66
+ border-bottom-right-radius: 10px;
67
+ box-shadow: 0px 0px 100px 5px rgb(0, 0, 0) inset;
68
+ transition: 0.5s;
69
+ margin-top: 10px;
70
+ width: 50%;
71
+ }
72
+
73
+ #submit:hover {
74
+ box-shadow: 0px 0px 20px 5px rgb(33, 77, 255);
75
+ transition: 0.5s;
76
+ }
77
+
78
+ .other-inputs {
79
+ padding: 10px;
80
+ outline: none;
81
+ border: 2px solid rgb(32, 161, 236);
82
+ border-radius: 10px;
83
+ font-size: medium;
84
+ background-color: transparent;
85
+ font-family: "Bitsumishi", sans-serif;
86
+ width: 100%;
87
+ color: white;
88
+ }
89
+
90
+ .control {
91
+ display: flex;
92
+ flex-direction: column;
93
+ justify-content: space-evenly;
94
+ text-align: left;
95
+ margin-left: 100px;
96
+ }
97
+
98
+ .data-preview {
99
+ margin-top: 25px;
100
+ }
101
+
102
+ .data{
103
+ background-color:rgb(22, 22, 22);
104
+ padding: 25px;
105
+ width: 20vw;
106
+ height: 15vw;
107
+ color: rgb(255, 255, 255);
108
+ border-radius: 10px;
109
+ overflow: scroll;
110
+ }
111
+
112
+ .other-fields{
113
+ display: flex;
114
+ flex-direction: column;
115
+ width: 20%;
116
+ justify-content: space-around;
117
+ height: 32vh;
118
+ }
119
+
120
+ .all-inputs{
121
+ display: flex;
122
+ width: 63vw;
123
+ justify-content: space-between;
124
+ font-weight: 100;
125
+ }
126
+
127
+ .center-input{
128
+ padding: 10px;
129
+ outline: none;
130
+ border: 2px solid rgb(32, 161, 236);
131
+ border-radius: 10px;
132
+ font-size: medium;
133
+ background-color: transparent;
134
+ font-family: "Bitsumishi", sans-serif;
135
+ width: 45%;
136
+ color: white;
137
+ }
138
+
139
+ .center-inputs{
140
+ display: flex;
141
+ width: 63vw;
142
+ justify-content: space-between;
143
+ }
144
+
145
+ #output{
146
+ width: 60vw
147
+ }
148
+
149
+ #current-data-preview{
150
+ outline: none;
151
+ border: none;
152
+ resize: none;
153
+ font-size: 14px;
154
+ }
Perceptrix/create_data/templates/index.html ADDED
@@ -0,0 +1,80 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Fine-tune Data CRYSTAL</title>
8
+ <link rel="stylesheet" href="{{url_for('static',filename='style.css')}}">
9
+ </head>
10
+
11
+ <body>
12
+ <h1>FINETUNE CRYSTAL</h1>
13
+ <form action="/record" method="post">
14
+ <div class="main">
15
+ <div class="entries">
16
+ <div class="all-inputs">
17
+ <textarea class="entry" name="notes" id="notes" placeholder="Enter Additional Notes Here:" oninput="updatePreview()">Notes- </textarea>
18
+ <textarea class="entry" name="input" id="input" placeholder="Enter Input Here:" oninput="updatePreview()"></textarea>
19
+ <div class="other-fields">
20
+ <input type="text" class="other-inputs" name="user" id="user" placeholder="User Name" oninput="updatePreview()">
21
+ <input type="text" class="other-inputs" name="time" id="time" placeholder="Time" oninput="updatePreview()">
22
+ <input type="text" class="other-inputs" name="weather" id="weather" placeholder="Weather" oninput="updatePreview()">
23
+ <input type="text" class="other-inputs" name="action" id="action" placeholder="Action" oninput="updatePreview()">
24
+ </div>
25
+ </div>
26
+ <div class="center-inputs">
27
+ <input type="text" class="center-input" name="current-events" id="current-events" placeholder="Current Events" oninput="updatePreview()">
28
+ <input type="text" class="center-input" name="speak" id="speak" placeholder="Speak" oninput="updatePreview()">
29
+ </div>
30
+ <textarea class="entry" name="output" id="output" placeholder="Enter Output Here:" oninput="updatePreview()"></textarea>
31
+ </div>
32
+ <div class="control">
33
+ <input id="submit" type="submit" value="ADD TO DATABASE">
34
+ <div class="data-preview">
35
+ <label style="color: white; font-size: x-large;" for="preview">Data Preview</label>
36
+ <textarea id="current-data-preview" name="current-data-preview" class="data"></textarea>
37
+ <p style="color: rgb(143, 143, 143); font-size: medium; margin: 5px;"> Data Entries: {{data_entries}}</p>
38
+ <pre class="data">{{full_data}}</pre>
39
+ </div>
40
+ </div>
41
+ </div>
42
+ </form>
43
+ <script>
44
+ var user = document.querySelector('#user');
45
+ var time = document.querySelector('#time');
46
+ var weather = document.querySelector('#weather');
47
+ var action = document.querySelector('#action');
48
+ var current_events = document.querySelector('#current-events');
49
+ var input = document.querySelector('#input');
50
+ var output = document.querySelector('#output');
51
+ var speak = document.querySelector('#speak');
52
+ var notes = document.querySelector('#notes');
53
+ var preview = document.querySelector('#current-data-preview');
54
+
55
+ time.value = "{{dummy_time}}";
56
+ weather.value = "{{dummy_weather}}";
57
+ current_events.value = "{{dummy_current_events}}";
58
+
59
+ function updatePreview() {
60
+ format = "Time- {time}\nWeather- {weather}\nSurroundings- {current_events}\n{notes}\n{user}: {input}\nCRYSTAL:<###CRYSTAL-INTERNAL###> Speak\n{speak}\n<###CRYSTAL-INTERNAL###> {action}\n{output}"
61
+ var formattedText = format.replace('{time}', time.value)
62
+ .replace('{weather}', weather.value)
63
+ .replace('{current_events}', current_events.value)
64
+ .replace('{user}', user.value)
65
+ .replace('{input}', input.value)
66
+ .replace('{action}', action.value)
67
+ .replace('{speak}', speak.value)
68
+ .replace('{notes}', notes.value)
69
+ .replace('{output}', output.value);
70
+
71
+ preview.textContent = formattedText;
72
+ }
73
+
74
+ // Initial preview update
75
+ updatePreview();
76
+
77
+ </script>
78
+ </body>
79
+
80
+ </html>
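The `updatePreview` format string in this template defines how one fine-tuning sample is laid out before the form posts it to `/record` in interface.py. A minimal Python sketch of the same assembly, reusing the template's format string; every field value below is an invented placeholder, not real training data:

```python
# Sketch only: mirrors the format string in templates/index.html.
# All field values are invented placeholders for illustration.
FORMAT = ("Time- {time}\nWeather- {weather}\nSurroundings- {current_events}\n"
          "{notes}\n{user}: {input}\nCRYSTAL:<###CRYSTAL-INTERNAL###> Speak\n"
          "{speak}\n<###CRYSTAL-INTERNAL###> {action}\n{output}")

sample = FORMAT.format(
    time="Mon 4 Mar 2024 09:15 AM",
    weather="Wake Forest, NC is Sunny with 72°F and Precipitation: 10%, Humidity: 40%, Wind: 5 mph",
    current_events="A living room with a couch, a chair, and a TV.",
    notes="Notes- ",
    user="User",
    input="Can you dim the lights?",
    speak="Dimming the lights now.",
    action="Home Automation",
    output="lights.dim(30)",  # hypothetical action text
)
print(sample)
```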
Perceptrix/engine.py ADDED
@@ -0,0 +1,213 @@
+ from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, BitsAndBytesConfig
+ from Perceptrix.streamer import TextStreamer
+ from utils import setup_device
+ import torch
+ import tqdm
+ import sys
+ import os
+
+ model_name = os.environ.get('LLM_MODEL')
+
+ model_id = "models/CRYSTAL-instruct" if model_name is None else model_name
+
+ device = setup_device()
+
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+
+ tokenizer = LlamaTokenizer.from_pretrained(
+     model_id,
+     use_fast=True)
+
+ model = LlamaForCausalLM.from_pretrained(
+     model_id,
+     load_in_8bit=False,
+     device_map="auto",
+     torch_dtype=torch.float16,
+     low_cpu_mem_usage=True,
+     offload_folder="offload",
+     quantization_config=bnb_config,
+ )
+
+ streamer = TextStreamer(tokenizer, skip_prompt=True,
+                         skip_special_tokens=True, save_file="reply.txt")
+
+ PROMPT = '''### Instruction:
+ {}
+ ### Input:
+ {}
+ ### Response:'''
+
+ model.config.pad_token_id = tokenizer.pad_token_id = 0
+ model.config.bos_token_id = 1
+ model.config.eos_token_id = 2
+
+ model.eval()
+ if torch.__version__ >= "2" and sys.platform != "win32":
+     model = torch.compile(model)
+
+
+ def evaluate(
+     prompt='',
+     temperature=0.4,
+     top_p=0.65,
+     top_k=35,
+     repetition_penalty=1.1,
+     max_new_tokens=512,
+     **kwargs,
+ ):
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"].to(device)
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         repetition_penalty=repetition_penalty,
+         **kwargs,
+     )
+
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=input_ids,
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+             streamer=streamer,
+         )
+     s = generation_output.sequences[0]
+     output = tokenizer.decode(s)
+     yield output.split("### Response:")[-1].strip()
+
+
+ def run_instruction(
+     instruction,
+     inputs,
+     temperature=0.4,
+     top_p=0.65,
+     top_k=35,
+     repetition_penalty=1.1,
+     max_new_tokens=512,
+     stop_tokens=None,
+ ):
+     now_prompt = PROMPT.format(instruction+'\n', inputs)
+
+     response = evaluate(
+         now_prompt, temperature, top_p, top_k, repetition_penalty, max_new_tokens, stop_tokens=stop_tokens, do_sample=True
+     )
+
+     for i in response:
+         yield i
+
+
+ def search_keyword(prompt):
+     instructions = """Prompt:Time: Fri, 23 August 2023 2:30PM\nWeather: 73F\nHow many friends have I told you about?
+ Search Keyword:Friends
+ Prompt:Time: Thu, 27 September 2023 3:41PM\nWeather: 62F\nWhat was our very first conversation
+ Chat Index:0
+ Prompt:Time: Tue, 21 September 2023 2:30PM\nWeather: 67F\nWhat was the last thing I said to you
+ Chat Index:-1
+ Prompt:Time: Sun, 31 October 2023 7:33AM\nWeather: 59F\nWhat was the last thing I said to you before that
+ Chat Index:-2
+ Prompt:Time: Sat, 30 October 2023 8:21PM\nWeather: 65F\nDid I ever tell you about my math class?
+ Search Keyword:math
+ Prompt:Time: Mon, 13 November 2023 4:52PM\nWeather: 55F\nWhat was my 7th grade English teacher's name?
+ Search Keyword:English
+ Prompt:Time: Wed, 15 May 2023 6:19PM\nWeather: 80F\nWhere did I say my wallet was?
+ Search Keyword:Wallet
+ Prompt:Time: Fri, 24 June 2023 1:52PM\nWeather: 92F\nWhat did Alex tell you?
+ Search Keyword:Alex
+ Prompt:Time: Sat, 19 July 2023 2:44PM\nWeather: 91F\nWhat was my first conversation today
+ Search Keyword:24 June"""
+     answer = ''.join(run_instruction(
+         instructions,
+         "Prompt:"+prompt+"\n",
+         temperature=0.5,
+         top_p=0.5,
+         top_k=200,
+         repetition_penalty=1.1,
+         max_new_tokens=256,
+     ))
+     return answer
+
+
+ def identify_objects_from_text(prompt):
+     instructions = """Input:The object that flies in the air from this picture is a toy helicopter
+ Output:Toy helicopter
+ Input:For the robot to be able to achieve the task, the robot needs to look for a white shirt
+ Output:White shirt
+ Input:To complete the task, the robot needs to remove the fruits from the wooden basket.
+ Output:fruits, wooden basket
+ Input:To clean up your desk, you need to gather and organize the various items scattered around it. This includes the laptop, cell phone, scissors, pens, and other objects. By putting these items back in their designated spaces or containers, you can create a more organized and clutter-free workspace.
+ Output:Laptop, cell phone, scissors, pens, containers
+ Input:The tree with a colorful sky background is the one to be looking for.
+ Output:Tree"""
+     answer = ''.join(run_instruction(
+         instructions,
+         prompt,
+         temperature=0.5,
+         top_p=0.5,
+         top_k=200,
+         repetition_penalty=1.1,
+         max_new_tokens=256,
+     ))
+     return answer
+
+
+ def robotix(prompt, stop=None):
+     instructions = """#Get me some water
+ objects = [['water: 57%', (781, 592)]]
+ robot.target((781, 592))
+ object_distance = distance()
+ if object_distance > 10:
+ robot.go("forward", object_distance, track="water")
+ robot.grab()
+ if object_distance > 10:
+ robot.go("back", object_distance)
+ robot.release("here")
+ #Stand by the table
+ objects = [['table: 81%', (1489, 1173)], ['table: 75%', (1971, 1293)]]
+ robot.target((1489, 1173))
+ if distance() > 10:
+ robot.go(forward, distance())
+ #Put the apples in the basket
+ objects = [['basket: 77%', (89, 112)], ['apples: 72%', (222, 182)]]
+ robot.target((281, 189))
+ if distance() > 10:
+ robot.go("forward", distance(), track="apples")
+ robot.grab()
+ robot.target(robot.find("basket"))
+ robot.release(distance())
+ #Go to the sofa
+ objects=[['sofa: 81%', (1060, 931)]]
+ robot.target((1060, 931))
+ if distance() > 10:
+ robot.go("forward", distance())
+ #Go to that person over there and then come back
+ objects=[['person: 85%', (331, 354)]]
+ robot.target((331, 354))
+ object_distance = distance()
+ if object_distance > 10:
+ robot.go("forward", object_distance)
+ robot.go("backward", object_distance)
+ """
+
+     answer = ''.join(run_instruction(
+         instructions,
+         prompt,
+         temperature=0.2,
+         top_p=0.5,
+         top_k=300,
+         repetition_penalty=1.1,
+         max_new_tokens=256,
+         stop_tokens=stop,
+     ))
+     return answer
+
+
+ if __name__ == "__main__":
+     print(robotix("#Get me a glass of water\nobjects = [['water: 65%', (695, 234)]]"))
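For completeness, a short usage sketch of the two text helpers defined above, in the spirit of the `robotix` demo in `__main__`. The prompts are invented examples and the commented expectations are only illustrative; actual output depends on the CRYSTAL-instruct weights that get loaded:

```python
# Invented example prompts; real outputs depend on the loaded model.
keyword = search_keyword(
    "Time: Mon, 4 March 2024 9:15AM\nWeather: 72F\nWhere did I say I left my keys?")
print(keyword)  # expected to resemble "Search Keyword:Keys"

objects = identify_objects_from_text(
    "To water the plants, the robot should pick up the green watering can near the window.")
print(objects)  # expected to resemble "Green watering can"
```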
Perceptrix/finetune/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ # Copyright 2022 MosaicML LLM Foundry authors
+ # SPDX-License-Identifier: Apache-2.0
+
+ ARG BASE_IMAGE
+ FROM $BASE_IMAGE
+
+ ARG DEP_GROUPS
+
+ # Install and uninstall foundry to cache foundry requirements
+ RUN git clone -b main https://github.com/mosaicml/llm-foundry.git
+ RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}"
+ RUN pip uninstall -y llm-foundry
+ RUN rm -rf llm-foundry
Perceptrix/finetune/Makefile ADDED
@@ -0,0 +1,23 @@
+ # several pytest settings
+ WORLD_SIZE ?= 1 # world size for launcher tests
+ MASTER_PORT ?= 26000 # port for distributed tests
+ PYTHON ?= python3 # Python command
+ PYTEST ?= pytest # Pytest command
+ PYRIGHT ?= pyright # Pyright command. Pyright must be installed separately -- e.g. `npm install -g pyright`
+ EXTRA_ARGS ?= # extra arguments for pytest
+ EXTRA_LAUNCHER_ARGS ?= # extra arguments for the composer cli launcher
+
+ test:
+ 	LOCAL_WORLD_SIZE=1 $(PYTHON) -m $(PYTEST) $(EXTRA_ARGS)
+
+ test-gpu:
+ 	LOCAL_WORLD_SIZE=1 $(PYTHON) -m $(PYTEST) -m gpu $(EXTRA_ARGS)
+
+ # runs tests with the launcher
+ test-dist:
+ 	$(PYTHON) -m composer.cli.launcher -n $(WORLD_SIZE) --master_port $(MASTER_PORT) $(EXTRA_LAUNCHER_ARGS) -m $(PYTEST) $(EXTRA_ARGS)
+
+ test-dist-gpu:
+ 	$(PYTHON) -m composer.cli.launcher -n $(WORLD_SIZE) --master_port $(MASTER_PORT) $(EXTRA_LAUNCHER_ARGS) -m $(PYTEST) -m gpu $(EXTRA_ARGS)
+
+ .PHONY: test test-gpu test-dist test-dist-gpu
Perceptrix/finetune/README.md ADDED
@@ -0,0 +1,265 @@
1
+ <!-- SETUPTOOLS_LONG_DESCRIPTION_HIDE_BEGIN -->
2
+ <p align="center">
3
+ <a href="https://github.com/mosaicml/llm-foundry">
4
+ <picture>
5
+ <img alt="LLM Foundry" src="./assets/llm-foundry.png" width="95%">
6
+ </picture>
7
+ </a>
8
+ </p>
9
+ <!-- SETUPTOOLS_LONG_DESCRIPTION_HIDE_END -->
10
+
11
+ <p align="center">
12
+ <a href="https://pypi.org/project/llm-foundry/">
13
+ <img alt="PyPi Version" src="https://img.shields.io/pypi/pyversions/llm-foundry">
14
+ </a>
15
+ <a href="https://pypi.org/project/llm-foundry/">
16
+ <img alt="PyPi Package Version" src="https://img.shields.io/pypi/v/llm-foundry">
17
+ </a>
18
+ <a href="https://mosaicml.me/slack">
19
+ <img alt="Chat @ Slack" src="https://img.shields.io/badge/slack-chat-2eb67d.svg?logo=slack">
20
+ </a>
21
+ <a href="https://github.com/mosaicml/llm-foundry/blob/main/LICENSE">
22
+ <img alt="License" src="https://img.shields.io/badge/License-Apache%202.0-green.svg">
23
+ </a>
24
+ </p>
25
+ <br />
26
+
27
+ # LLM Foundry
28
+
29
+ This repository contains code for training, finetuning, evaluating, and deploying LLMs for inference with [Composer](https://github.com/mosaicml/composer) and the [MosaicML platform](https://forms.mosaicml.com/demo?utm_source=github.com&utm_medium=referral&utm_campaign=llm-foundry). Designed to be easy-to-use, efficient _and_ flexible, this codebase enables rapid experimentation with the latest techniques.
30
+
31
+ You'll find in this repo:
32
+ * `llmfoundry/` - source code for models, datasets, callbacks, utilities, etc.
33
+ * `scripts/` - scripts to run LLM workloads
34
+ * `data_prep/` - convert text data from original sources to StreamingDataset format
35
+ * `train/` - train or finetune HuggingFace and MPT models from 125M - 70B parameters
36
+ * `train/benchmarking` - profile training throughput and MFU
37
+ * `inference/` - convert models to HuggingFace or ONNX format, and generate responses
38
+ * `inference/benchmarking` - profile inference latency and throughput
39
+ * `eval/` - evaluate LLMs on academic (or custom) in-context-learning tasks
40
+ * `mcli/` - launch any of these workloads using [MCLI](https://docs.mosaicml.com/projects/mcli/en/latest/) and the [MosaicML platform](https://www.mosaicml.com/platform)
41
+ * `TUTORIAL.md` - a deeper dive into the repo, example workflows, and FAQs
42
+
43
+ # MPT
44
+
45
+ Mosaic Pretrained Transformers (MPT) are GPT-style models with some special features -- Flash Attention for efficiency, ALiBi for context length extrapolation, and stability improvements to mitigate loss spikes. As part of MosaicML's Foundation series, we have open-sourced several MPT models:
46
+
47
+
48
+ | Model | Context Length | Download | Demo | Commercial use? |
49
+ | ------------------ | -------------- | -------------------------------------------------- | ----------------------------------------------------------- | --------------- |
50
+ | MPT-30B | 8192 | https://huggingface.co/mosaicml/mpt-30b | | Yes |
51
+ | MPT-30B-Instruct | 8192 | https://huggingface.co/mosaicml/mpt-30b-instruct | | Yes |
52
+ | MPT-30B-Chat | 8192 | https://huggingface.co/mosaicml/mpt-30b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-30b-chat) | No |
53
+ | MPT-7B | 2048 | https://huggingface.co/mosaicml/mpt-7b | | Yes |
54
+ | MPT-7B-Instruct | 2048 | https://huggingface.co/mosaicml/mpt-7b-instruct | | Yes |
55
+ | MPT-7B-Chat | 2048 | https://huggingface.co/mosaicml/mpt-7b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-7b-chat) | No |
56
+ | MPT-7B-StoryWriter | 65536 | https://huggingface.co/mosaicml/mpt-7b-storywriter | | Yes |
57
+
58
+ To try out these models locally, [follow the instructions](https://github.com/mosaicml/llm-foundry/tree/main/scripts/inference#interactive-generation-with-modelgenerate) in `scripts/inference/README.md` to prompt HF models using our [hf_generate.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/inference/hf_generate.py) or [hf_chat.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/inference/hf_chat.py) scripts.
59
+
60
+ # MPT Community
61
+
62
+ We've been overwhelmed by all the amazing work the community has put into MPT! Here we provide a few links to some of them:
63
+ * [ReplitLM](https://github.com/replit/replitLM): `replit-code-v1-3b` is a 2.7B Causal Language Model focused on Code Completion. The model has been trained on a subset of the Stack Dedup v1.2 dataset covering 20 languages such as Java, Python, and C++
64
+ * [LLaVa-MPT](https://github.com/haotian-liu/LLaVA#LLaVA-MPT-7b): Visual instruction tuning to get MPT multimodal capabilities
65
+ * [ggml](https://github.com/ggerganov/ggml/tree/master): Optimized MPT version for efficient inference on consumer hardware
66
+ * [GPT4All](https://gpt4all.io/index.html): locally running chat system, now with MPT support!
67
+ * [Q8MPT-Chat](https://huggingface.co/spaces/Intel/Q8-Chat): 8-bit optimized MPT for CPU by our friends at Intel
68
+
69
+ Tutorial videos from the community:
70
+ * [Using MPT-7B with Langchain](https://www.youtube.com/watch?v=DXpk9K7DgMo&t=3s) by [@jamesbriggs](https://www.youtube.com/@jamesbriggs)
71
+ * [MPT-7B StoryWriter Intro](https://www.youtube.com/watch?v=O9Y_ZdsuKWQ) by [AItrepreneur](https://www.youtube.com/@Aitrepreneur)
72
+ * [Fine-tuning MPT-7B on a single GPU](https://www.youtube.com/watch?v=KSlWkrByc0o&t=9s) by [@AIology2022](https://www.youtube.com/@AIology2022)
73
+ * [How to Fine-tune MPT-7B-Instruct on Google Colab](https://youtu.be/3de0Utr9XnI) by [@VRSEN](https://www.youtube.com/@vrsen)
74
+
75
+ Something missing? Contribute with a PR!
76
+
77
+ # Latest News
78
+ * [Blog: MPT-30B: Raising the bar for open-source foundation models](https://www.mosaicml.com/blog/mpt-30b)
79
+ * [Blog: Introducing MPT-7B](https://www.mosaicml.com/blog/mpt-7b)
80
+ * [Blog: Benchmarking LLMs on H100](https://www.mosaicml.com/blog/coreweave-nvidia-h100-part-1)
81
+ * [Blog: Blazingly Fast LLM Evaluation](https://www.mosaicml.com/blog/llm-evaluation-for-icl)
82
+ * [Blog: GPT3 Quality for $500k](https://www.mosaicml.com/blog/gpt-3-quality-for-500k)
83
+ * [Blog: Billion parameter GPT training made easy](https://www.mosaicml.com/blog/billion-parameter-gpt-training-made-easy)
84
+
85
+
86
+
87
+ # Hardware and Software Requirements
88
+ This codebase has been tested with PyTorch 1.13.1 and PyTorch 2.0.1 on systems with NVIDIA A100s and H100s.
89
+ This codebase may also work on systems with other devices, such as consumer NVIDIA cards and AMD cards, but we are not actively testing these systems.
90
+ If you have success/failure using LLM Foundry on other systems, please let us know in a Github issue and we will update the support matrix!
91
+
92
+ | Device | Torch Version | Cuda Version | Status |
93
+ | -------------- | ------------- | ------------ | ---------------------------- |
94
+ | A100-40GB/80GB | 1.13.1 | 11.7 | :white_check_mark: Supported |
95
+ | A100-40GB/80GB | 2.0.1 | 11.7, 11.8 | :white_check_mark: Supported |
96
+ | A100-40GB/80GB | 2.1.0 | 11.8, 12.1 | :white_check_mark: Supported |
97
+ | H100-80GB | 1.13.1 | 11.7 | :x: Not Supported |
98
+ | H100-80GB | 2.0.1 | 11.8 | :white_check_mark: Supported |
99
+ | H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported |
100
+ | A10-24GB | 1.13.1 | 11.7 | :construction: In Progress |
101
+ | A10-24GB | 2.0.1 | 11.7, 11.8 | :construction: In Progress |
102
+ | MI250 | 2.0.1 | ROCm 5.4 | :construction: In Progress |
103
+
104
+ ## MosaicML Docker Images
105
+ We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories.
106
+
107
+ The `mosaicml/pytorch` images are pinned to specific PyTorch and CUDA versions, and are stable and rarely updated.
108
+
109
+ The `mosaicml/llm-foundry` images are built with new tags upon every commit to the `main` branch.
110
+ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117-f678575` or take the latest one using `mosaicml/llm-foundry:1.13.1_cu117-latest`.
111
+
112
+ **Please Note:** The `mosaicml/llm-foundry` images do not come with the `llm-foundry` package preinstalled, just the dependencies. You will still need to `pip install llm-foundry` either from PyPi or from source.
113
+
114
+ | Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? |
115
+ | ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- |
116
+ | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1 | 11.7 (Infiniband) | No |
117
+ | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | 2.0.1 | 11.8 (Infiniband) | No |
118
+ | `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 (Infiniband) | No |
119
+ | `mosaicml/llm-foundry:1.13.1_cu117-latest` | 1.13.1 | 11.7 (Infiniband) | Yes |
120
+ | `mosaicml/llm-foundry:2.0.1_cu118-latest` | 2.0.1 | 11.8 (Infiniband) | Yes |
121
+ | `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v1) |
122
+ | `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v2) |
123
+ | `mosaicml/llm-foundry:2.1.0_cu121_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v1) |
124
+ | `mosaicml/llm-foundry:2.1.0_cu121_flash2_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v2) |
125
+
126
+
127
+ # Installation
128
+
129
+ This assumes you already have PyTorch and CMake installed.
130
+
131
+ To get started, clone the repo and set up your environment. Instructions to do so differ slightly depending on whether you're using Docker.
132
+ ### With Docker (recommended)
133
+
134
+ We *strongly* recommend working with LLM Foundry inside a Docker container (see our recommended Docker image above). If you are doing so, follow these steps to clone the repo and install the requirements.
135
+
136
+ <!--pytest.mark.skip-->
137
+ ```bash
138
+ git clone https://github.com/mosaicml/llm-foundry.git
139
+ cd llm-foundry
140
+ pip install -e ".[gpu]" # or pip install -e . if no NVIDIA GPU
141
+ ```
142
+
143
+ ### Without Docker (not recommended)
144
+
145
+ If you choose not to use Docker, you should create and use a virtual environment.
146
+
147
+ <!--pytest.mark.skip-->
148
+ ```bash
149
+ git clone https://github.com/mosaicml/llm-foundry.git
150
+ cd llm-foundry
151
+
152
+ # Create and activate a virtual environment
153
+ python3 -m venv llmfoundry-venv
154
+ source llmfoundry-venv/bin/activate
155
+
156
+ pip install cmake packaging torch # setup.py requires these be installed
157
+
158
+ pip install -e ".[gpu]" # or pip install -e . if no NVIDIA GPU
159
+ ```
160
+
161
+ ### TransformerEngine and amp_fp8 support
162
+ NVIDIA H100 GPUs have FP8 support; this additionally requires the following installations:
163
+ <!--pytest.mark.skip-->
164
+ ```bash
165
+ pip install flash-attn==1.0.7 --no-build-isolation
166
+ pip install git+https://github.com/NVIDIA/[email protected]
167
+ ```
168
+
169
+ See [here](https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#TransformerEngine-and-amp_fp8-support) for more details on enabling TransformerEngine layers and amp_fp8.
170
+
171
+ ### AMD (BETA support)
172
+
173
+ In [our testing of AMD GPUs](https://www.mosaicml.com/blog/amd-mi250), the env setup includes:
174
+
175
+ <!--pytest.mark.skip-->
176
+ ```bash
177
+ git clone https://github.com/mosaicml/llm-foundry.git
178
+ cd llm-foundry
179
+
180
+ # Create and activate a virtual environment
181
+ python3 -m venv llmfoundry-venv-amd
182
+ source llmfoundry-venv-amd/bin/activate
183
+
184
+ # installs
185
+ pip install cmake packaging torch
186
+ pip install -e . # This installs some things that are not needed but they don't hurt
187
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2
188
+ ```
189
+ **Lastly**, install the ROCm enabled flash attention (instructions [here](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm2#amd-gpurocm-support)).
190
+
191
+ Notes:
192
+ 1. `attn_impl: triton` does not work.
193
+ 1. We don't yet have a Docker image where everything works perfectly. You might need to upgrade or downgrade some packages (in our case, we needed to downgrade to `numpy==1.23.5`, as shown below) before everything runs without issue.
194
+
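+ For example, the NumPy downgrade mentioned above is just a version pin inside the AMD virtual environment:
+
+ <!--pytest.mark.skip-->
+ ```bash
+ # Pin NumPy to the version that worked in our AMD testing
+ pip install numpy==1.23.5
+ ```
+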
195
+ # Quickstart
196
+
197
+ > **Note**
198
+ > Make sure to go through the installation steps above before trying the quickstart!
199
+
200
+ Here is an end-to-end workflow for preparing a subset of the C4 dataset, training an MPT-125M model for 10 batches,
201
+ converting the model to HuggingFace format, evaluating it on the COPA in-context-learning task, and generating responses to prompts.
202
+
203
+ **(Remember, this is a quickstart just to demonstrate the tools; to get good quality, the LLM must be trained for far longer than 10 batches 😄)**
204
+
205
+ <!--pytest.mark.skip-->
206
+ ```bash
207
+ cd scripts
208
+
209
+ # Convert C4 dataset to StreamingDataset format
210
+ python data_prep/convert_dataset_hf.py \
211
+ --dataset c4 --data_subset en \
212
+ --out_root my-copy-c4 --splits train_small val_small \
213
+ --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
214
+
215
+ # Train an MPT-125m model for 10 batches
216
+ composer train/train.py \
217
+ train/yamls/pretrain/mpt-125m.yaml \
218
+ data_local=my-copy-c4 \
219
+ train_loader.dataset.split=train_small \
220
+ eval_loader.dataset.split=val_small \
221
+ max_duration=10ba \
222
+ eval_interval=0 \
223
+ save_folder=mpt-125m
224
+
225
+ # Convert the model to HuggingFace format
226
+ python inference/convert_composer_to_hf.py \
227
+ --composer_path mpt-125m/ep0-ba10-rank0.pt \
228
+ --hf_output_path mpt-125m-hf \
229
+ --output_precision bf16 \
230
+ # --hf_repo_for_upload user-org/repo-name
231
+
232
+ # Evaluate the model on a subset of tasks
233
+ composer eval/eval.py \
234
+ eval/yamls/hf_eval.yaml \
235
+ icl_tasks=eval/yamls/copa.yaml \
236
+ model_name_or_path=mpt-125m-hf
237
+
238
+ # Generate responses to prompts
239
+ python inference/hf_generate.py \
240
+ --name_or_path mpt-125m-hf \
241
+ --max_new_tokens 256 \
242
+ --prompts \
243
+ "The answer to life, the universe, and happiness is" \
244
+ "Here's a quick recipe for baking chocolate chip cookies: Start by"
245
+ ```
246
+
247
+ Note: the `composer` command used above to train the model refers to the [Composer](https://github.com/mosaicml/composer) library's distributed launcher.
248
+
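+ For example, on a single node with 8 GPUs the launcher spawns one training process per GPU. A minimal sketch (the `-n` process-count flag belongs to the launcher, not to `train.py`; adjust it to your hardware):
+
+ <!--pytest.mark.skip-->
+ ```bash
+ # Run the same quickstart training job across 8 GPUs with the Composer launcher
+ composer -n 8 train/train.py \
+   train/yamls/pretrain/mpt-125m.yaml \
+   data_local=my-copy-c4 \
+   train_loader.dataset.split=train_small \
+   eval_loader.dataset.split=val_small \
+   max_duration=10ba \
+   eval_interval=0 \
+   save_folder=mpt-125m
+ ```
+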
249
+ If you have a write-enabled [HuggingFace auth token](https://huggingface.co/docs/hub/security-tokens), you can optionally upload your model to the Hub! Just export your token like this:
250
+
251
+ ```bash
252
+ export HUGGING_FACE_HUB_TOKEN=your-auth-token
253
+ ```
254
+
255
+ and uncomment the line containing `--hf_repo_for_upload ...` in the above call to `inference/convert_composer_to_hf.py`.
256
+
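+ For reference, the conversion call from the quickstart with the upload flag uncommented looks like this (`user-org/repo-name` is a placeholder for your own Hub repo):
+
+ <!--pytest.mark.skip-->
+ ```bash
+ # Convert the Composer checkpoint and push the resulting HF folder to the Hub
+ python inference/convert_composer_to_hf.py \
+   --composer_path mpt-125m/ep0-ba10-rank0.pt \
+   --hf_output_path mpt-125m-hf \
+   --output_precision bf16 \
+   --hf_repo_for_upload user-org/repo-name
+ ```
+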
257
+ # Learn more about LLM Foundry!
258
+
259
+ Check out [TUTORIAL.md](https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md) to keep learning about working with LLM Foundry. The tutorial highlights example workflows, points you to other resources throughout the repo, and answers frequently asked questions!
260
+
261
+ # Contact Us
262
+
263
+ If you run into any problems with the code, please file GitHub issues directly on this repo.
264
+
265
+ If you want to train LLMs on the MosaicML platform, reach out to us at [[email protected]](mailto:[email protected])!
Perceptrix/finetune/build/lib/inference/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ __all__ = []
Perceptrix/finetune/build/lib/inference/convert_composer_mpt_to_ft.py ADDED
@@ -0,0 +1,232 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Note: This script is specifically for converting MPT Composer checkpoints to FasterTransformer format.
5
+
6
+ import configparser
7
+ import os
8
+ import tempfile
9
+ from argparse import ArgumentParser, Namespace
10
+ from pathlib import Path
11
+ from typing import Any, Dict, Optional, Union
12
+
13
+ import torch
14
+ from composer.utils import get_file, safe_torch_load
15
+ from transformers import PreTrainedTokenizer
16
+
17
+ from llmfoundry.utils import (convert_and_save_ft_weights,
18
+ get_hf_tokenizer_from_composer_state_dict)
19
+
20
+
21
+ def save_ft_config(composer_config: Dict[str, Any],
22
+ tokenizer: PreTrainedTokenizer,
23
+ save_dir: str,
24
+ infer_gpu_num: int = 1,
25
+ weight_data_type: str = 'fp32',
26
+ force: bool = False):
27
+
28
+ config = configparser.ConfigParser()
29
+ config['gpt'] = {}
30
+ try:
31
+ config['gpt']['model_name'] = 'mpt'
32
+ config['gpt']['head_num'] = str(composer_config['n_heads'])
33
+ n_embd = composer_config['d_model']
34
+ config['gpt']['size_per_head'] = str(n_embd //
35
+ composer_config['n_heads'])
36
+ config['gpt']['inter_size'] = str(n_embd * composer_config['mlp_ratio'])
37
+ config['gpt']['max_pos_seq_len'] = str(composer_config['max_seq_len'])
38
+ config['gpt']['num_layer'] = str(composer_config['n_layers'])
39
+ config['gpt']['vocab_size'] = str(composer_config['vocab_size'])
40
+ config['gpt']['start_id'] = str(tokenizer.bos_token_id)
41
+ config['gpt']['end_id'] = str(tokenizer.eos_token_id)
42
+ config['gpt']['weight_data_type'] = weight_data_type
43
+ config['gpt']['tensor_para_size'] = str(infer_gpu_num)
44
+ # nn.LayerNorm default eps is 1e-5
45
+ config['gpt']['layernorm_eps'] = str(1e-5)
46
+ if composer_config['alibi']:
47
+ config['gpt']['has_positional_encoding'] = str(False)
48
+ config['gpt']['use_attention_linear_bias'] = str(True)
49
+ if composer_config['attn_clip_qkv'] and not force:
50
+ raise RuntimeError(
51
+ 'clip_qkv is enabled for this MPT model. This may not work as expected in FT. Use --force to force a conversion.'
52
+ )
53
+ if composer_config['attn_qk_ln'] and not force:
54
+ raise RuntimeError(
55
+ 'qk_ln is enabled for this MPT model. This may not work as expected in FT. Use --force to force a conversion.'
56
+ )
57
+
58
+ with open(os.path.join(save_dir, 'config.ini'), 'w') as configfile:
59
+ config.write(configfile)
60
+ return config
61
+ except Exception:
62
+ print('Failed to save the config in config.ini.')
63
+ raise
64
+
65
+
66
+ def write_ft_checkpoint_from_composer_checkpoint(
67
+ checkpoint_path: Union[Path, str],
68
+ infer_gpu_num: int,
69
+ save_dir: str,
70
+ output_precision: str = 'fp32',
71
+ local_checkpoint_save_location: Optional[Union[Path,
72
+ str]] = None) -> None:
73
+ """Convert a Composer checkpoint to a FasterTransformer checkpoint folder.
74
+
75
+ .. note:: This function may not work properly if you used surgery algorithms when you trained your model. In that case you may need to
76
+ edit the parameter conversion methods to properly convert your custom model.
77
+
78
+ Args:
79
+ checkpoint_path (Union[Path, str]): Path to the composer checkpoint, can be a local path, or a remote path beginning with ``s3://``, or another backend
80
+ supported by Composer.
81
+ infer_gpu_num (int): The number of gpus you are planning to use for inference.
82
+ save_dir (str): Path of the directory to save the checkpoint in FT format.
83
+ output_precision (str, optional): The precision of the output weights saved to the FasterTransformer model. Can be either ``fp32`` or ``fp16``.
84
+ local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file to locally.
85
+ If the input ``checkpoint_path`` is already a local path, this will be a symlink.
86
+ Defaults to None, which will use a temporary file.
87
+ """
88
+ dtype = {
89
+ 'fp32': torch.float32,
90
+ 'fp16': torch.float16,
91
+ }[output_precision]
92
+
93
+ # default local path to a tempfile if path is not provided
94
+ if local_checkpoint_save_location is None:
95
+ tmp_dir = tempfile.TemporaryDirectory()
96
+ local_checkpoint_save_location = Path(
97
+ tmp_dir.name) / 'local-composer-checkpoint.pt'
98
+
99
+ # download the checkpoint file
100
+ print(
101
+ f'Downloading checkpoint from {checkpoint_path} -> {local_checkpoint_save_location}'
102
+ )
103
+ get_file(str(checkpoint_path), str(local_checkpoint_save_location))
104
+
105
+ # Load the Composer checkpoint. Use it to get the
106
+ # Composer state dict and weights
107
+ print('Loading checkpoint into CPU RAM...')
108
+ composer_state_dict = safe_torch_load(local_checkpoint_save_location)
109
+
110
+ # Extract Composer config from state dict
111
+ if 'state' not in composer_state_dict:
112
+ raise RuntimeError(
113
+ f'"state" is not an available key in the provided composer checkpoint. Is {local_checkpoint_save_location} ill-formed?'
114
+ )
115
+ if 'integrations' not in composer_state_dict[
116
+ 'state'] or 'huggingface' not in composer_state_dict['state'][
117
+ 'integrations']:
118
+ raise RuntimeError(
119
+ 'Did not find HuggingFace related state (e.g., tokenizer) in the provided composer checkpoint!'
120
+ )
121
+ composer_config = composer_state_dict['state']['integrations'][
122
+ 'huggingface']['model']['config']['content']
123
+
124
+ # Extract the HF tokenizer
125
+ print('#' * 30)
126
+ print('Extracting HF Tokenizer...')
127
+ hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
128
+ composer_state_dict)
129
+ if hf_tokenizer is None:
130
+ print('Warning! No HF Tokenizer found!')
131
+
132
+ # Extract the model weights
133
+ weights_state_dict = composer_state_dict['state']['model']
134
+ torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
135
+ weights_state_dict, prefix='model.')
136
+
137
+ # Converting weights to desired dtype
138
+ for k, v in weights_state_dict.items():
139
+ if isinstance(v, torch.Tensor):
140
+ weights_state_dict[k] = v.to(dtype=dtype)
141
+
142
+ # Convert the weights using the config and tokenizer to FasterTransformer format
143
+ print('#' * 30)
144
+ print('Saving FasterTransformer config...')
145
+ save_ft_config(composer_config,
146
+ tokenizer=hf_tokenizer,
147
+ save_dir=save_dir,
148
+ weight_data_type=output_precision)
149
+ print('#' * 30)
150
+ print('Converting weights to FasterTransformer format...')
151
+ convert_and_save_ft_weights(named_params=weights_state_dict,
152
+ config=composer_config,
153
+ infer_gpu_num=infer_gpu_num,
154
+ weight_data_type=output_precision,
155
+ save_dir=save_dir)
156
+
157
+ print('#' * 30)
158
+ print(
159
+ f'FasterTransformer checkpoint folder successfully created at {save_dir}.'
160
+ )
161
+
162
+ print('Done.')
163
+ print('#' * 30)
164
+
165
+
166
+ def parse_args() -> Namespace:
167
+ """Parse commandline arguments."""
168
+ parser = ArgumentParser(
169
+ description=
170
+ 'Convert an MPT Composer checkpoint into a standard FasterTransformer checkpoint folder.'
171
+ )
172
+ parser.add_argument(
173
+ '--composer_path',
174
+ '-i',
175
+ type=str,
176
+ help='Composer checkpoint path. Can be a local file path or cloud URI',
177
+ required=True)
178
+ parser.add_argument(
179
+ '--local_checkpoint_save_location',
180
+ type=str,
181
+ help='If specified, where to save the checkpoint file to locally. \
182
+ If the input ``checkpoint_path`` is already a local path, this will be a symlink. \
183
+ Defaults to None, which will use a temporary file.',
184
+ default=None)
185
+ parser.add_argument(
186
+ '--ft_save_dir',
187
+ '-o',
188
+ type=str,
189
+ help='Directory to save FasterTransformer converted checkpoint in',
190
+ required=True)
191
+ parser.add_argument('--infer_gpu_num',
192
+ '-i_g',
193
+ type=int,
194
+ help='How many gpus for inference?',
195
+ required=True)
196
+ parser.add_argument(
197
+ '--force',
198
+ action='store_true',
199
+ help=
200
+ 'Force conversion to FT even if some features may not work as expected in FT'
201
+ )
202
+ parser.add_argument(
203
+ '--output_precision',
204
+ type=str,
205
+ help=
206
+ 'Data type of weights in the FasterTransformer output model. Input checkpoint weights will be converted to this dtype.',
207
+ choices=['fp32', 'fp16'],
208
+ default='fp32')
209
+
210
+ return parser.parse_args()
211
+
212
+
213
+ if __name__ == '__main__':
214
+ args = parse_args()
215
+ print('\n=============== Argument ===============')
216
+ for key in vars(args):
217
+ print('{}: {}'.format(key, vars(args)[key]))
218
+ print('========================================')
219
+
220
+ save_dir = os.path.join(args.ft_save_dir, f'{args.infer_gpu_num}-gpu')
221
+
222
+ if not os.path.exists(save_dir):
223
+ os.makedirs(save_dir)
224
+ else:
225
+ raise RuntimeError(f'Output path {save_dir} already exists!')
226
+
227
+ write_ft_checkpoint_from_composer_checkpoint(
228
+ checkpoint_path=args.composer_path,
229
+ infer_gpu_num=args.infer_gpu_num,
230
+ save_dir=save_dir,
231
+ output_precision=args.output_precision,
232
+ local_checkpoint_save_location=args.local_checkpoint_save_location)
Perceptrix/finetune/build/lib/inference/convert_composer_to_hf.py ADDED
@@ -0,0 +1,290 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import os
5
+ import tempfile
6
+ from argparse import ArgumentParser, Namespace
7
+ from pathlib import Path
8
+ from typing import Optional, Tuple, Union
9
+
10
+ import torch
11
+ import transformers
12
+ from composer.models.huggingface import get_hf_config_from_composer_state_dict
13
+ from composer.utils import (get_file, maybe_create_object_store_from_uri,
14
+ parse_uri, safe_torch_load)
15
+ from transformers import PretrainedConfig, PreTrainedTokenizerBase
16
+
17
+ from llmfoundry import MPTConfig, MPTForCausalLM
18
+ from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict
19
+ from llmfoundry.utils.huggingface_hub_utils import \
20
+ edit_files_for_hf_compatibility
21
+
22
+
23
+ def write_huggingface_pretrained_from_composer_checkpoint(
24
+ checkpoint_path: Union[Path, str],
25
+ output_path: Union[Path, str],
26
+ output_precision: str = 'fp32',
27
+ local_checkpoint_save_location: Optional[Union[Path, str]] = None
28
+ ) -> Tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]:
29
+ """Convert a Composer checkpoint to a pretrained HF checkpoint folder.
30
+
31
+ Write a ``config.json`` and ``pytorch_model.bin``, like
32
+ :meth:`transformers.PreTrainedModel.from_pretrained` expects, from a
33
+ composer checkpoint.
34
+
35
+ .. note:: This function will not work properly if you used surgery algorithms when you trained your model. In that case you will want to
36
+ load the model weights using the Composer :class:`~composer.Trainer` with the ``load_path`` argument.
37
+ .. testsetup::
38
+ import torch
39
+ dataset = RandomTextClassificationDataset(size=16, use_keys=True)
40
+ train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
41
+ eval_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
42
+ import transformers
43
+ from composer.models import HuggingFaceModel
44
+ from composer.trainer import Trainer
45
+ hf_model = transformers.AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)
46
+ hf_tokenizer = transformers.AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')
47
+ composer_model = HuggingFaceModel(hf_model, tokenizer=hf_tokenizer, metrics=[], use_logits=True)
48
+ trainer = Trainer(model=composer_model,
49
+ train_dataloader=train_dataloader,
50
+ save_filename='composer-hf-checkpoint.pt',
51
+ max_duration='1ep',
52
+ save_folder='./')
53
+ trainer.fit()
54
+ trainer.close()
55
+
56
+ Example:
57
+ .. testcode::
58
+ from composer.models import write_huggingface_pretrained_from_composer_checkpoint
59
+ write_huggingface_pretrained_from_composer_checkpoint('composer-hf-checkpoint.pt', './hf-save-pretrained-output')
60
+ loaded_model = transformers.AutoModelForSequenceClassification.from_pretrained('./hf-save-pretrained-output')
61
+
62
+ Args:
63
+ checkpoint_path (Union[Path, str]): Path to the composer checkpoint, can be a local path, or a remote path beginning with ``s3://``, or another backend
64
+ supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
65
+ output_path (Union[Path, str]): Path to the folder to write the output to.
66
+ output_precision (str, optional): The precision of the output weights saved to `pytorch_model.bin`. Can be one of ``fp32``, ``fp16``, or ``bf16``.
67
+ local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file to locally.
68
+ If the input ``checkpoint_path`` is already a local path, this will be a symlink.
69
+ Defaults to None, which will use a temporary file.
70
+ """
71
+ dtype = {
72
+ 'fp32': torch.float32,
73
+ 'fp16': torch.float16,
74
+ 'bf16': torch.bfloat16,
75
+ }[output_precision]
76
+
77
+ # default local path to a tempfile if path is not provided
78
+ if local_checkpoint_save_location is None:
79
+ tmp_dir = tempfile.TemporaryDirectory()
80
+ local_checkpoint_save_location = Path(
81
+ tmp_dir.name) / 'local-composer-checkpoint.pt'
82
+
83
+ # create folder
84
+ os.makedirs(output_path)
85
+
86
+ # download the checkpoint file
87
+ print(
88
+ f'Downloading checkpoint from {checkpoint_path} -> {local_checkpoint_save_location}'
89
+ )
90
+ get_file(str(checkpoint_path), str(local_checkpoint_save_location))
91
+
92
+ # Load the Composer checkpoint state dict
93
+ print('Loading checkpoint into CPU RAM...')
94
+ composer_state_dict = safe_torch_load(local_checkpoint_save_location)
95
+
96
+ if 'state' not in composer_state_dict:
97
+ raise RuntimeError(
98
+ f'"state" is not an available key in the provided composer checkpoint. Is {local_checkpoint_save_location} ill-formed?'
99
+ )
100
+
101
+ # Build and save HF Config
102
+ print('#' * 30)
103
+ print('Saving HF Model Config...')
104
+ hf_config = get_hf_config_from_composer_state_dict(composer_state_dict)
105
+ hf_config.torch_dtype = dtype
106
+ hf_config.save_pretrained(output_path)
107
+ print(hf_config)
108
+
109
+ # Extract and save the HF tokenizer
110
+ print('#' * 30)
111
+ print('Saving HF Tokenizer...')
112
+ hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
113
+ composer_state_dict)
114
+ if hf_tokenizer is not None:
115
+ hf_tokenizer.save_pretrained(output_path)
116
+ print(hf_tokenizer)
117
+ else:
118
+ print('Warning! No HF Tokenizer found!')
119
+
120
+ # Extract the HF model weights
121
+ print('#' * 30)
122
+ print('Saving HF Model Weights...')
123
+ weights_state_dict = composer_state_dict
124
+ if 'state' in weights_state_dict:
125
+ weights_state_dict = weights_state_dict['state']['model']
126
+ torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
127
+ weights_state_dict, prefix='model.')
128
+
129
+ # Convert weights to desired dtype
130
+ for k, v in weights_state_dict.items():
131
+ if isinstance(v, torch.Tensor):
132
+ weights_state_dict[k] = v.to(dtype=dtype)
133
+
134
+ # Save weights
135
+ torch.save(weights_state_dict, Path(output_path) / 'pytorch_model.bin')
136
+
137
+ print('#' * 30)
138
+ print(f'HF checkpoint folder successfully created at {output_path}.')
139
+
140
+ return hf_config, hf_tokenizer
141
+
142
+
143
+ def parse_args() -> Namespace:
144
+ """Parse commandline arguments."""
145
+ parser = ArgumentParser(
146
+ description=
147
+ 'Convert a HuggingFace causal LM in a Composer checkpoint into a standard HuggingFace checkpoint folder, and optionally upload to the hub.'
148
+ )
149
+ parser.add_argument('--composer_path', type=str, required=True)
150
+ parser.add_argument('--hf_output_path', type=str, required=True)
151
+ parser.add_argument('--local_checkpoint_save_location',
152
+ type=str,
153
+ default=None)
154
+ parser.add_argument('--output_precision',
155
+ type=str,
156
+ choices=['fp32', 'fp16', 'bf16'],
157
+ default='fp32')
158
+ parser.add_argument('--hf_repo_for_upload', type=str, default=None)
159
+ parser.add_argument('--test_uploaded_model', action='store_true')
160
+
161
+ return parser.parse_args()
162
+
163
+
164
+ def convert_composer_to_hf(args: Namespace) -> None:
165
+ print()
166
+ print('#' * 30)
167
+ print('Converting Composer checkpoint to HuggingFace checkpoint format...')
168
+
169
+ # Register MPT auto classes so that this script works with MPT
170
+ # This script will not work without modification for other custom models,
171
+ # but will work for other HuggingFace causal LMs
172
+ from transformers.models.auto.configuration_auto import CONFIG_MAPPING
173
+ CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
174
+ MPTConfig.register_for_auto_class()
175
+ MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
176
+
177
+ _, _, local_folder_path = parse_uri(args.hf_output_path)
178
+
179
+ config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
180
+ checkpoint_path=args.composer_path,
181
+ output_path=local_folder_path,
182
+ output_precision=args.output_precision,
183
+ local_checkpoint_save_location=args.local_checkpoint_save_location)
184
+
185
+ dtype = {
186
+ 'fp32': torch.float32,
187
+ 'fp16': torch.float16,
188
+ 'bf16': torch.bfloat16,
189
+ }[args.output_precision]
190
+
191
+ print(f'Loading model from {local_folder_path}')
192
+ if config.model_type == 'mpt':
193
+ config.attn_config['attn_impl'] = 'torch'
194
+ config.init_device = 'cpu'
195
+
196
+ if config.model_type == 'mpt':
197
+ loaded_hf_model = MPTForCausalLM.from_pretrained(local_folder_path,
198
+ config=config,
199
+ torch_dtype=dtype)
200
+ else:
201
+ loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(
202
+ local_folder_path, config=config, torch_dtype=dtype)
203
+
204
+ delattr(loaded_hf_model.config, '_name_or_path')
205
+
206
+ loaded_hf_model.save_pretrained(local_folder_path)
207
+
208
+ print(f'Loading tokenizer from {local_folder_path}')
209
+ tokenizer = transformers.AutoTokenizer.from_pretrained(local_folder_path)
210
+ tokenizer.save_pretrained(local_folder_path)
211
+
212
+ # Only need to edit files for MPT because it has custom code
213
+ if config.model_type == 'mpt':
214
+ print('Editing files for HF compatibility...')
215
+ edit_files_for_hf_compatibility(local_folder_path)
216
+
217
+ object_store = maybe_create_object_store_from_uri(str(args.hf_output_path))
218
+
219
+ if object_store is not None:
220
+ print(
221
+ f'Uploading HF checkpoint folder from {local_folder_path} -> {args.hf_output_path}'
222
+ )
223
+ for file in os.listdir(local_folder_path):
224
+ remote_file = os.path.join(local_folder_path, file)
225
+ local_file = os.path.join(local_folder_path, file)
226
+ object_store.upload_object(remote_file, local_file)
227
+
228
+ if args.hf_repo_for_upload is not None:
229
+ from huggingface_hub import HfApi
230
+ api = HfApi()
231
+
232
+ print(
233
+ f'Uploading {args.hf_output_path} to HuggingFace Hub at {args.hf_repo_for_upload}'
234
+ )
235
+ api.create_repo(repo_id=args.hf_repo_for_upload,
236
+ use_auth_token=True,
237
+ repo_type='model',
238
+ private=True,
239
+ exist_ok=True)
240
+ print('Repo created.')
241
+
242
+ # ignore the full checkpoint file if we now have sharded checkpoint files
243
+ ignore_patterns = []
244
+ if any(
245
+ f.startswith('pytorch_model-00001')
246
+ for f in os.listdir(args.hf_output_path)):
247
+ ignore_patterns.append('pytorch_model.bin')
248
+
249
+ api.upload_folder(folder_path=args.hf_output_path,
250
+ repo_id=args.hf_repo_for_upload,
251
+ use_auth_token=True,
252
+ repo_type='model',
253
+ ignore_patterns=ignore_patterns)
254
+ print('Folder uploaded.')
255
+
256
+ if args.test_uploaded_model:
257
+ print('Testing uploaded model...')
258
+ hub_model = transformers.AutoModelForCausalLM.from_pretrained(
259
+ args.hf_repo_for_upload,
260
+ trust_remote_code=True,
261
+ use_auth_token=True,
262
+ torch_dtype=dtype)
263
+ hub_tokenizer = transformers.AutoTokenizer.from_pretrained(
264
+ args.hf_repo_for_upload,
265
+ trust_remote_code=True,
266
+ use_auth_token=True)
267
+
268
+ assert sum(p.numel() for p in hub_model.parameters()) == sum(
269
+ p.numel() for p in loaded_hf_model.parameters())
270
+ assert all(
271
+ str(type(module1)).split('.')[-2:] == str(type(module2)).split(
272
+ '.')[-2:] for module1, module2 in zip(
273
+ hub_model.modules(), loaded_hf_model.modules()))
274
+
275
+ assert next(
276
+ hub_model.parameters()
277
+ ).dtype == dtype, f'Expected model dtype to be {dtype}, but got {next(hub_model.parameters()).dtype}'
278
+ print(
279
+ hub_tokenizer.batch_decode(
280
+ hub_model.generate(hub_tokenizer(
281
+ 'MosaicML is', return_tensors='pt').input_ids,
282
+ max_new_tokens=10)))
283
+
284
+ print(
285
+ 'Composer checkpoint successfully converted to HuggingFace checkpoint format.'
286
+ )
287
+
288
+
289
+ if __name__ == '__main__':
290
+ convert_composer_to_hf(parse_args())
Perceptrix/finetune/build/lib/inference/convert_hf_mpt_to_ft.py ADDED
@@ -0,0 +1,154 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Convert MPT model checkpoint to FT format.
19
+
20
+ It's a modified version of
21
+ https://github.com/NVIDIA/FasterTransformer/blob/main/examples/pytorch/gpt/utils/huggingface_gpt_convert.py
22
+ """
23
+
24
+ import argparse
25
+ import configparser
26
+ import os
27
+
28
+ import transformers
29
+
30
+ from llmfoundry.utils import convert_and_save_ft_weights
31
+
32
+
33
+ def convert_mpt_to_ft(model_name_or_path: str,
34
+ output_dir: str,
35
+ infer_gpu_num: int = 1,
36
+ weight_data_type: str = 'fp32',
37
+ force: bool = False) -> None:
38
+ """Convert an MPT checkpoint to a FasterTransformer compatible format.
39
+
40
+ Args:
41
+ model_name_or_path (str): The HF hub name of the model (e.g., mosaicml/mpt-7b) or the path of a directory
42
+ containing an MPT checkpoint in a local dir.
43
+ output_dir (str): Path of the directory to save the checkpoint in FT format. The directory must not already exist.
44
+ infer_gpu_num (int): The number of gpus you are planning to use for inference.
45
+ weight_data_type (str): Data type of the weights in the input checkpoint.
46
+ force (bool): force conversion even with unsupported features in FT.
47
+ """
48
+ save_dir = os.path.join(output_dir, f'{infer_gpu_num}-gpu')
49
+
50
+ if not os.path.exists(save_dir):
51
+ os.makedirs(save_dir)
52
+ else:
53
+ raise RuntimeError(f'Output path {save_dir} already exists!')
54
+
55
+ # do conversion on cpu
56
+ torch_device = 'cpu'
57
+
58
+ model = transformers.AutoModelForCausalLM.from_pretrained(
59
+ model_name_or_path, trust_remote_code=True).to(torch_device)
60
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
61
+ model_name_or_path, trust_remote_code=True)
62
+
63
+ hf_config = vars(model.config)
64
+
65
+ config = configparser.ConfigParser()
66
+ config['gpt'] = {}
67
+ try:
68
+ config['gpt']['model_name'] = 'mpt' if hf_config[
69
+ '_name_or_path'] == '' else hf_config['_name_or_path']
70
+ config['gpt']['head_num'] = str(hf_config['n_heads'])
71
+ n_embd = hf_config['d_model']
72
+ config['gpt']['size_per_head'] = str(n_embd // hf_config['n_heads'])
73
+ config['gpt']['inter_size'] = str(n_embd * hf_config['expansion_ratio'])
74
+ config['gpt']['max_pos_seq_len'] = str(hf_config['max_seq_len'])
75
+ config['gpt']['num_layer'] = str(hf_config['n_layers'])
76
+ config['gpt']['vocab_size'] = str(hf_config['vocab_size'])
77
+ config['gpt']['start_id'] = str(
78
+ hf_config['bos_token_id']
79
+ ) if hf_config['bos_token_id'] is not None else str(tokenizer.bos_token_id)
80
+ config['gpt']['end_id'] = str(
81
+ hf_config['eos_token_id']
82
+ ) if hf_config['eos_token_id'] is not None else str(tokenizer.eos_token_id)
83
+ config['gpt']['weight_data_type'] = weight_data_type
84
+ config['gpt']['tensor_para_size'] = str(infer_gpu_num)
85
+ # nn.LayerNorm default eps is 1e-5
86
+ config['gpt']['layernorm_eps'] = str(1e-5)
87
+ if hf_config['attn_config']['alibi']:
88
+ config['gpt']['has_positional_encoding'] = str(False)
89
+ config['gpt']['use_attention_linear_bias'] = str(True)
90
+ if hf_config['attn_config']['clip_qkv'] and not force:
91
+ raise RuntimeError(
92
+ 'clip_qkv is enabled for this MPT model. This may not work as expected in FT. Use --force to force a conversion.'
93
+ )
94
+ if hf_config['attn_config']['qk_ln'] and not force:
95
+ raise RuntimeError(
96
+ 'qk_ln is enabled for this MPT model. This may not work as expected in FT. Use --force to force a conversion.'
97
+ )
98
+
99
+ with open(os.path.join(save_dir, 'config.ini'), 'w') as configfile:
100
+ config.write(configfile)
101
+ except Exception:
102
+ print('Failed to save the config in config.ini.')
103
+ raise
104
+
105
+ named_params_dict = {
106
+ name: param for name, param in model.named_parameters()
107
+ }
108
+ convert_and_save_ft_weights(named_params=named_params_dict,
109
+ config=hf_config,
110
+ infer_gpu_num=infer_gpu_num,
111
+ weight_data_type=weight_data_type,
112
+ save_dir=save_dir)
113
+
114
+
115
+ if __name__ == '__main__':
116
+ parser = argparse.ArgumentParser(
117
+ formatter_class=argparse.RawTextHelpFormatter)
118
+ parser.add_argument('--save_dir',
119
+ '-o',
120
+ type=str,
121
+ help='Directory to save converted checkpoint in',
122
+ required=True)
123
+ parser.add_argument(
124
+ '--name_or_dir',
125
+ '-i',
126
+ type=str,
127
+ help=
128
+ 'HF hub Model name (e.g., mosaicml/mpt-7b) or local dir path to load checkpoint from',
129
+ required=True)
130
+ parser.add_argument('--infer_gpu_num',
131
+ '-i_g',
132
+ type=int,
133
+ help='How many gpus for inference?',
134
+ required=True)
135
+ parser.add_argument(
136
+ '--force',
137
+ action='store_true',
138
+ help=
139
+ 'Force conversion to FT even if some features may not work as expected in FT'
140
+ )
141
+ parser.add_argument('--weight_data_type',
142
+ type=str,
143
+ help='Data type of weights in the input checkpoint',
144
+ default='fp32',
145
+ choices=['fp32', 'fp16'])
146
+
147
+ args = parser.parse_args()
148
+ print('\n=============== Argument ===============')
149
+ for key in vars(args):
150
+ print('{}: {}'.format(key, vars(args)[key]))
151
+ print('========================================')
152
+
153
+ convert_mpt_to_ft(args.name_or_dir, args.save_dir, args.infer_gpu_num,
154
+ args.weight_data_type, args.force)
Perceptrix/finetune/build/lib/inference/convert_hf_to_onnx.py ADDED
@@ -0,0 +1,229 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Basic HuggingFace -> ONNX export script.
5
+
6
+ This scripts show a basic HuggingFace -> ONNX export workflow. This works for a MPT model
7
+ that has been saved using `MPT.save_pretrained`. For more details and examples
8
+ of exporting and working with HuggingFace models with ONNX, see https://huggingface.co/docs/transformers/serialization#export-to-onnx.
9
+
10
+ Example usage:
11
+
12
+ 1) Local export
13
+
14
+ python convert_hf_to_onnx.py --pretrained_model_name_or_path local/path/to/huggingface/folder --output_folder local/folder
15
+
16
+ 2) Remote export
17
+
18
+ python convert_hf_to_onnx.py --pretrained_model_name_or_path local/path/to/huggingface/folder --output_folder s3://bucket/remote/folder
19
+
20
+ 3) Verify the exported model
21
+
22
+ python convert_hf_to_onnx.py --pretrained_model_name_or_path local/path/to/huggingface/folder --output_folder local/folder --verify_export
23
+
24
+ 4) Change the batch size or max sequence length
25
+
26
+ python convert_hf_to_onnx.py --pretrained_model_name_or_path local/path/to/huggingface/folder --output_folder local/folder --export_batch_size 1 --max_seq_len 32000
27
+ """
28
+
29
+ import argparse
30
+ import os
31
+ from argparse import ArgumentTypeError
32
+ from pathlib import Path
33
+ from typing import Any, Dict, Optional, Union
34
+
35
+ import torch
36
+ from composer.utils import (maybe_create_object_store_from_uri, parse_uri,
37
+ reproducibility)
38
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
39
+
40
+
41
+ def str2bool(v: Union[str, bool]):
42
+ if isinstance(v, bool):
43
+ return v
44
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
45
+ return True
46
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
47
+ return False
48
+ else:
49
+ raise ArgumentTypeError('Boolean value expected.')
50
+
51
+
52
+ def str_or_bool(v: Union[str, bool]):
53
+ if isinstance(v, bool):
54
+ return v
55
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
56
+ return True
57
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
58
+ return False
59
+ else:
60
+ return v
61
+
62
+
63
+ def gen_random_batch(batch_size: int, vocab_size: int, max_seq_len: int):
64
+ # generate input batch of random data
65
+ batch = {
66
+ 'input_ids':
67
+ torch.randint(
68
+ low=0,
69
+ high=vocab_size,
70
+ size=(batch_size, max_seq_len),
71
+ dtype=torch.int64,
72
+ ),
73
+ 'attention_mask':
74
+ torch.ones(size=(batch_size, max_seq_len), dtype=torch.bool)
75
+ }
76
+ return batch
77
+
78
+
79
+ def export_to_onnx(
80
+ pretrained_model_name_or_path: str,
81
+ output_folder: str,
82
+ export_batch_size: int,
83
+ max_seq_len: Optional[int],
84
+ verify_export: bool,
85
+ from_pretrained_kwargs: Dict[str, Any],
86
+ ):
87
+ reproducibility.seed_all(42)
88
+ save_object_store = maybe_create_object_store_from_uri(output_folder)
89
+ _, _, parsed_save_path = parse_uri(output_folder)
90
+
91
+ print('Loading HF config/model/tokenizer...')
92
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
93
+ **from_pretrained_kwargs)
94
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path,
95
+ **from_pretrained_kwargs)
96
+
97
+ # specifically for MPT, switch to the torch version of attention for ONNX export
98
+ if hasattr(config, 'attn_config'):
99
+ config.attn_config['attn_impl'] = 'torch'
100
+
101
+ model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path,
102
+ config=config,
103
+ **from_pretrained_kwargs)
104
+ model.eval()
105
+
106
+ if max_seq_len is None and not hasattr(model.config, 'max_seq_len'):
107
+ raise ValueError(
108
+ 'max_seq_len must be specified in either the model config or as an argument to this function.'
109
+ )
110
+ elif max_seq_len is None:
111
+ max_seq_len = model.config.max_seq_len
112
+
113
+ assert isinstance(max_seq_len, int) # pyright
114
+
115
+ print('Creating random batch...')
116
+ sample_input = gen_random_batch(
117
+ export_batch_size,
118
+ len(tokenizer),
119
+ max_seq_len,
120
+ )
121
+
122
+ with torch.no_grad():
123
+ model(**sample_input)
124
+
125
+ output_file = Path(parsed_save_path) / 'model.onnx'
126
+ os.makedirs(parsed_save_path, exist_ok=True)
127
+ print('Exporting the model with ONNX...')
128
+ torch.onnx.export(
129
+ model,
130
+ (sample_input,),
131
+ str(output_file),
132
+ input_names=['input_ids', 'attention_mask'],
133
+ output_names=['output'],
134
+ opset_version=16,
135
+ )
136
+
137
+ if verify_export:
138
+ with torch.no_grad():
139
+ orig_out = model(**sample_input)
140
+
141
+ import onnx
142
+ import onnx.checker
143
+ import onnxruntime as ort
144
+
145
+ _ = onnx.load(str(output_file))
146
+
147
+ onnx.checker.check_model(str(output_file))
148
+
149
+ ort_session = ort.InferenceSession(str(output_file))
150
+
151
+ for key, value in sample_input.items():
152
+ sample_input[key] = value.cpu().numpy()
153
+
154
+ loaded_model_out = ort_session.run(None, sample_input)
155
+
156
+ torch.testing.assert_close(
157
+ orig_out.logits.detach().numpy(),
158
+ loaded_model_out[0],
159
+ rtol=1e-2,
160
+ atol=1e-2,
161
+ msg='Output mismatch between the original and ONNX-exported model',
162
+ )
163
+ print('Exported model output matches the unexported model!')
164
+
165
+ if save_object_store is not None:
166
+ print('Uploading files to object storage...')
167
+ for filename in os.listdir(parsed_save_path):
168
+ full_path = str(Path(parsed_save_path) / filename)
169
+ save_object_store.upload_object(full_path, full_path)
170
+
171
+
172
+ def parse_args():
173
+ parser = argparse.ArgumentParser(description='Convert HF model to ONNX',)
174
+ parser.add_argument(
175
+ '--pretrained_model_name_or_path',
176
+ type=str,
177
+ required=True,
178
+ )
179
+ parser.add_argument(
180
+ '--output_folder',
181
+ type=str,
182
+ required=True,
183
+ )
184
+ parser.add_argument(
185
+ '--export_batch_size',
186
+ type=int,
187
+ default=8,
188
+ )
189
+ parser.add_argument(
190
+ '--max_seq_len',
191
+ type=int,
192
+ default=None,
193
+ )
194
+ parser.add_argument(
195
+ '--verify_export',
196
+ action='store_true',
197
+ )
198
+ parser.add_argument('--trust_remote_code',
199
+ type=str2bool,
200
+ nargs='?',
201
+ const=True,
202
+ default=True)
203
+ parser.add_argument('--use_auth_token',
204
+ type=str_or_bool,
205
+ nargs='?',
206
+ const=True,
207
+ default=None)
208
+ parser.add_argument('--revision', type=str, default=None)
209
+ return parser.parse_args()
210
+
211
+
212
+ def main(args: argparse.Namespace):
213
+ from_pretrained_kwargs = {
214
+ 'use_auth_token': args.use_auth_token,
215
+ 'trust_remote_code': args.trust_remote_code,
216
+ 'revision': args.revision,
217
+ }
218
+
219
+ export_to_onnx(
220
+ pretrained_model_name_or_path=args.pretrained_model_name_or_path,
221
+ output_folder=args.output_folder,
222
+ export_batch_size=args.export_batch_size,
223
+ max_seq_len=args.max_seq_len,
224
+ verify_export=args.verify_export,
225
+ from_pretrained_kwargs=from_pretrained_kwargs)
226
+
227
+
228
+ if __name__ == '__main__':
229
+ main(parse_args())
Perceptrix/finetune/build/lib/inference/hf_chat.py ADDED
@@ -0,0 +1,389 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import time
5
+ import warnings
6
+ from argparse import ArgumentParser, ArgumentTypeError, Namespace
7
+ from contextlib import nullcontext
8
+ from typing import Any, Dict, List, Optional, Union
9
+
10
+ import torch
11
+ from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
12
+ PreTrainedModel, PreTrainedTokenizerBase,
13
+ StoppingCriteria, StoppingCriteriaList, TextStreamer)
14
+
15
+
16
+ class ChatFormatter:
17
+ """A class for formatting the chat history.
18
+
19
+ Args:
20
+ system: The system prompt. If None, a default ChatML-formatted prompt is used.
21
+ user: The user prompt. If None, a default ChatML value is used.
22
+ assistant: The assistant prompt. If None, a default ChatML value is used.
23
+
24
+ Attributes:
25
+ system: The system prompt.
26
+ user: The user prompt.
27
+ assistant: The assistant prompt.
28
+ response_prefix: The response prefix (anything before {} in the assistant format string)
29
+ """
30
+
31
+ def __init__(self, system: str, user: str, assistant: str) -> None:
32
+ self.system = system if system else '<|im_start|>system\nA conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.<|im_end|>\n'
33
+ self.user = user if user else '<|im_start|>user\n{}<|im_end|>\n'
34
+ self.assistant = assistant if assistant else '<|im_start|>assistant\n{}<|im_end|>\n'
35
+ self.response_prefix = self.assistant.split('{}')[0]
36
+
37
+
38
+ class Conversation:
39
+ """A class for interacting with a chat-tuned LLM.
40
+
41
+ Args:
42
+ model: The model to use for inference.
43
+ tokenizer: The tokenizer to use for inference.
44
+ chat_format: The chat format to use for the conversation.
45
+ generate_kwargs: The keyword arguments to pass to `model.generate`.
46
+ stop_tokens: The tokens to stop generation on.
47
+
48
+ Attributes:
49
+ model: The model to use for inference.
50
+ tokenizer: The tokenizer to use for inference.
51
+ chat_format: The chat format to use for the conversation.
52
+ streamer: The streamer to use for inference.
53
+ generate_kwargs: The keyword arguments to pass to `model.generate`.
54
+ history: The conversation history.
55
+ cli_instructions: The instructions to display to the user.
56
+ """
57
+
58
+ def __init__(self,
59
+ model: PreTrainedModel,
60
+ tokenizer: PreTrainedTokenizerBase,
61
+ chat_format: ChatFormatter,
62
+ generate_kwargs: Dict[str, Any],
63
+ stop_tokens: Optional[List[str]] = None) -> None:
64
+ if stop_tokens is None:
65
+ stop_tokens = ['<|endoftext|>', '<|im_end|>']
66
+ self.model = model
67
+ self.tokenizer = tokenizer
68
+ self.chat_format = chat_format
69
+
70
+ stop_token_ids = self.tokenizer.convert_tokens_to_ids(stop_tokens)
71
+ if len(stop_token_ids) != len(stop_tokens):
72
+ warnings.warn(
73
+ f'Not all stop tokens were found in the tokenizer vocabulary: {stop_tokens}\n'
74
+ + 'Generation may stop or continue unexpectedly.')
75
+
76
+ class StopOnTokens(StoppingCriteria):
77
+
78
+ def __call__(self, input_ids: torch.LongTensor,
79
+ scores: torch.FloatTensor, **kwargs: Any) -> bool:
80
+ del kwargs # unused
81
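+ # Only the most recently generated token needs to be compared against each stop id.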
+ for stop_id in stop_token_ids:
82
+ if input_ids[0][-1] == stop_id:
83
+ return True
84
+ return False
85
+
86
+ self.streamer = TextStreamer(tokenizer,
87
+ skip_prompt=True,
88
+ skip_special_tokens=True)
89
+ self.generate_kwargs = {
90
+ **generate_kwargs,
91
+ 'stopping_criteria':
92
+ StoppingCriteriaList([StopOnTokens()]),
93
+ 'streamer':
94
+ self.streamer,
95
+ }
96
+ self.history = []
97
+ self.cli_instructions = (
98
+ 'Enter your message below.\n- Hit return twice to send input to the model\n'
99
+ +
100
+ "- Type 'clear' to restart the conversation\n- Type 'history' to see the conversation\n"
101
+ +
102
+ "- Type 'quit' to end\n- Type 'system' to change the system prompt\n"
103
+ )
104
+
105
+ def _history_as_formatted_str(self) -> str:
106
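+ # Render every completed turn, then append the latest user message and the assistant response prefix so the model generates the next reply.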
+ text = self.chat_format.system + ''.join([
107
+ '\n'.join([
108
+ self.chat_format.user.format(item[0]),
109
+ self.chat_format.assistant.format(item[1]),
110
+ ]) for item in self.history[:-1]
111
+ ])
112
+ text += self.chat_format.user.format(self.history[-1][0])
113
+ text += self.chat_format.response_prefix
114
+ return text
115
+
116
+ def turn(self, user_inp: str) -> None:
117
+ self.history.append([user_inp, ''])
118
+ conversation = self._history_as_formatted_str()
119
+ input_ids = self.tokenizer(conversation, return_tensors='pt').input_ids
120
+ input_ids = input_ids.to(self.model.device)
121
+ # also stream to stdout
122
+ maybe_synchronize()
123
+ start = time.time()
124
+ print('Assistant:')
125
+ gkwargs = {**self.generate_kwargs, 'input_ids': input_ids}
126
+ # this will stream to stdout, but we need to keep track of the output_ids for saving history
127
+ output_ids = self.model.generate(**gkwargs)
128
+ maybe_synchronize()
129
+ end = time.time()
130
+ print(f'took {end - start:.2f} seconds')
131
+ new_tokens = output_ids[0, len(input_ids[0]):]
132
+ assistant_response = self.tokenizer.decode(new_tokens,
133
+ skip_special_tokens=True)
134
+ self.history[-1][-1] = assistant_response
135
+
136
+ def __call__(self) -> None:
137
+ print(self.cli_instructions)
138
+ while True:
139
+ print('User:')
140
+ user_inp_lines = []
141
+ while True:
142
+ line = input()
143
+ if line.strip() == '':
144
+ break
145
+ user_inp_lines.append(line)
146
+ user_inp = '\n'.join(user_inp_lines)
147
+ if user_inp.lower() == 'quit':
148
+ break
149
+ elif user_inp.lower() == 'clear':
150
+ self.history = []
151
+ continue
152
+ elif user_inp == 'history':
153
+ print(f'history: {self.history}')
154
+ continue
155
+ elif user_inp == 'history_fmt':
156
+ print(f'history: {self._history_as_formatted_str()}')
157
+ continue
158
+ elif user_inp == 'system':
159
+ print('Enter a new system prompt:')
160
+ new_system = input()
161
+ sys = f'<|im_start|>system\n{new_system.strip()}.<|im_end|>\n'
162
+ self.chat_format.system = sys
163
+ continue
164
+ self.turn(user_inp)
165
+
166
+
167
+ def get_dtype(dtype: str):
168
+ if dtype == 'fp32':
169
+ return torch.float32
170
+ elif dtype == 'fp16':
171
+ return torch.float16
172
+ elif dtype == 'bf16':
173
+ return torch.bfloat16
174
+ else:
175
+ raise NotImplementedError(
176
+ f'dtype {dtype} is not supported. ' +
177
+ 'We only support fp32, fp16, and bf16 currently')
178
+
179
+
180
+ def str2bool(v: Union[str, bool]):
181
+ if isinstance(v, bool):
182
+ return v
183
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
184
+ return True
185
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
186
+ return False
187
+ else:
188
+ raise ArgumentTypeError('Boolean value expected.')
189
+
190
+
191
+ def str_or_bool(v: Union[str, bool]):
192
+ if isinstance(v, bool):
193
+ return v
194
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
195
+ return True
196
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
197
+ return False
198
+ else:
199
+ return v
200
+
201
+
202
+ def parse_args() -> Namespace:
203
+ """Parse commandline arguments."""
204
+ parser = ArgumentParser(
205
+ description='Load a HF CausalLM Model and use it to generate text.')
206
+ parser.add_argument('-n', '--name_or_path', type=str, required=True)
207
+ parser.add_argument('--max_new_tokens', type=int, default=512)
208
+ parser.add_argument('--max_seq_len', type=int, default=None)
209
+ parser.add_argument('--temperature', type=float, default=1.0)
210
+ parser.add_argument('--top_k', type=int, default=50)
211
+ parser.add_argument('--top_p', type=float, default=1.0)
212
+ parser.add_argument('--do_sample',
213
+ type=str2bool,
214
+ nargs='?',
215
+ const=True,
216
+ default=True)
217
+ parser.add_argument('--use_cache',
218
+ type=str2bool,
219
+ nargs='?',
220
+ const=True,
221
+ default=True)
222
+ parser.add_argument('--eos_token_id', type=str, default=None)
223
+ parser.add_argument('--pad_token_id', type=str, default=None)
224
+ parser.add_argument('--model_dtype',
225
+ type=str,
226
+ choices=['fp32', 'fp16', 'bf16'],
227
+ default=None)
228
+ parser.add_argument('--autocast_dtype',
229
+ type=str,
230
+ choices=['fp32', 'fp16', 'bf16'],
231
+ default=None)
232
+ parser.add_argument('--warmup',
233
+ type=str2bool,
234
+ nargs='?',
235
+ const=True,
236
+ default=True)
237
+ parser.add_argument('--trust_remote_code',
238
+ type=str2bool,
239
+ nargs='?',
240
+ const=True,
241
+ default=True)
242
+ parser.add_argument('--use_auth_token',
243
+ type=str_or_bool,
244
+ nargs='?',
245
+ const=True,
246
+ default=None)
247
+ parser.add_argument('--revision', type=str, default=None)
248
+ parser.add_argument('--device', type=str, default=None)
249
+ parser.add_argument('--device_map', type=str, default=None)
250
+ parser.add_argument('--attn_impl', type=str, default=None)
251
+ parser.add_argument('--seed', type=int, default=42)
252
+ parser.add_argument('--system_prompt', type=str, default=None)
253
+ parser.add_argument('--user_msg_fmt', type=str, default=None)
254
+ parser.add_argument('--assistant_msg_fmt', type=str, default=None)
255
+ parser.add_argument(
256
+ '--stop_tokens',
257
+ type=str,
258
+ default='<|endoftext|> <|im_end|>',
259
+ help='A string of tokens to stop generation on; will be split on spaces.'
260
+ )
261
+ return parser.parse_args()
262
+
263
+
264
+ def maybe_synchronize():
265
+ if torch.cuda.is_available():
266
+ torch.cuda.synchronize()
267
+
268
+
269
+ def main(args: Namespace) -> None:
270
+ # Set device or device_map
271
+ if args.device and args.device_map:
272
+ raise ValueError('You can only set one of `device` and `device_map`.')
273
+ if args.device is not None:
274
+ device = args.device
275
+ device_map = None
276
+ else:
277
+ device = None
278
+ device_map = args.device_map or 'auto'
279
+ print(f'Using {device=} and {device_map=}')
280
+
281
+ # Set model_dtype
282
+ if args.model_dtype is not None:
283
+ model_dtype = get_dtype(args.model_dtype)
284
+ else:
285
+ model_dtype = torch.float32
286
+ print(f'Using {model_dtype=}')
287
+
288
+ # Grab config first
289
+ print(f'Loading HF Config...')
290
+ from_pretrained_kwargs = {
291
+ 'use_auth_token': args.use_auth_token,
292
+ 'trust_remote_code': args.trust_remote_code,
293
+ 'revision': args.revision,
294
+ }
295
+ try:
296
+ config = AutoConfig.from_pretrained(args.name_or_path,
297
+ **from_pretrained_kwargs)
298
+ if args.attn_impl is not None and hasattr(config, 'attn_config'):
299
+ config.attn_config['attn_impl'] = args.attn_impl
300
+ if hasattr(config, 'init_device') and device is not None:
301
+ config.init_device = device
302
+ if args.max_seq_len is not None and hasattr(config, 'max_seq_len'):
303
+ config.max_seq_len = args.max_seq_len
304
+
305
+ except Exception as e:
306
+ raise RuntimeError(
307
+ 'If you are having auth problems, try logging in via `huggingface-cli login` '
308
+ +
309
+ 'or by setting the environment variable `export HUGGING_FACE_HUB_TOKEN=... '
310
+ +
311
+ 'using your access token from https://huggingface.co/settings/tokens.'
312
+ ) from e
313
+
314
+ # Load HF Model
315
+ print(f'Loading HF model with dtype={model_dtype}...')
316
+ try:
317
+ model = AutoModelForCausalLM.from_pretrained(args.name_or_path,
318
+ config=config,
319
+ torch_dtype=model_dtype,
320
+ device_map=device_map,
321
+ **from_pretrained_kwargs)
322
+ model.eval()
323
+ print(f'n_params={sum(p.numel() for p in model.parameters())}')
324
+ if device is not None:
325
+ print(f'Placing model on {device=}...')
326
+ model.to(device)
327
+ except Exception as e:
328
+ raise RuntimeError(
329
+ 'Unable to load HF model. ' +
330
+ 'If you are having auth problems, try logging in via `huggingface-cli login` '
331
+ +
332
+ 'or by setting the environment variable `export HUGGING_FACE_HUB_TOKEN=... '
333
+ +
334
+ 'using your access token from https://huggingface.co/settings/tokens.'
335
+ ) from e
336
+
337
+ print('\nLoading HF tokenizer...')
338
+ tokenizer = AutoTokenizer.from_pretrained(args.name_or_path,
339
+ **from_pretrained_kwargs)
340
+ if tokenizer.pad_token_id is None:
341
+ warnings.warn(
342
+ 'pad_token_id is not set for the tokenizer. Using eos_token_id as pad_token_id.'
343
+ )
344
+ tokenizer.pad_token = tokenizer.eos_token
345
+ tokenizer.padding_side = 'left'
346
+
347
+ generate_kwargs = {
348
+ 'max_new_tokens': args.max_new_tokens,
349
+ 'temperature': args.temperature,
350
+ 'top_p': args.top_p,
351
+ 'top_k': args.top_k,
352
+ 'use_cache': args.use_cache,
353
+ 'do_sample': args.do_sample,
354
+ 'eos_token_id': args.eos_token_id or tokenizer.eos_token_id,
355
+ 'pad_token_id': args.pad_token_id or tokenizer.eos_token_id,
356
+ }
357
+ # Autocast
358
+ if args.autocast_dtype is not None:
359
+ autocast_dtype = get_dtype(args.autocast_dtype)
360
+ autocast_context = torch.autocast(model.device.type, autocast_dtype)
361
+ print(f'Using autocast with dtype={autocast_dtype}...')
362
+ else:
363
+ autocast_context = nullcontext()
364
+ print('NOT using autocast...')
365
+
366
+ chat_format = ChatFormatter(system=args.system_prompt,
367
+ user=args.user_msg_fmt,
368
+ assistant=args.assistant_msg_fmt)
369
+
370
+ conversation = Conversation(model=model,
371
+ tokenizer=tokenizer,
372
+ chat_format=chat_format,
373
+ generate_kwargs=generate_kwargs,
374
+ stop_tokens=args.stop_tokens.split())
375
+
376
+ # Warmup
377
+ if args.warmup:
378
+ print('Warming up...')
379
+ with autocast_context:
380
+ conversation.turn('Write a welcome message to the user.')
381
+ conversation.history = []
382
+
383
+ print('Starting conversation...')
384
+ with autocast_context:
385
+ conversation()
386
+
387
+
388
+ if __name__ == '__main__':
389
+ main(parse_args())
Perceptrix/finetune/build/lib/inference/hf_generate.py ADDED
@@ -0,0 +1,372 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import itertools
4
+ import os
5
+ import random
6
+ import time
7
+ import warnings
8
+ from argparse import ArgumentParser, ArgumentTypeError, Namespace
9
+ from contextlib import nullcontext
10
+ from typing import Dict, Union
11
+
12
+ import numpy as np
13
+ import torch
14
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
15
+
16
+
17
+ def get_dtype(dtype: str):
18
+ if dtype == 'fp32':
19
+ return torch.float32
20
+ elif dtype == 'fp16':
21
+ return torch.float16
22
+ elif dtype == 'bf16':
23
+ return torch.bfloat16
24
+ else:
25
+ raise NotImplementedError(
26
+ f'dtype {dtype} is not supported. ' +\
27
+ f'We only support fp32, fp16, and bf16 currently')
28
+
29
+
30
+ def str2bool(v: Union[str, bool]):
31
+ if isinstance(v, bool):
32
+ return v
33
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
34
+ return True
35
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
36
+ return False
37
+ else:
38
+ raise ArgumentTypeError('Boolean value expected.')
39
+
40
+
41
+ def str_or_bool(v: Union[str, bool]):
42
+ if isinstance(v, bool):
43
+ return v
44
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
45
+ return True
46
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
47
+ return False
48
+ else:
49
+ return v
50
+
51
+
52
+ def parse_args() -> Namespace:
53
+ """Parse commandline arguments."""
54
+ parser = ArgumentParser(
55
+ description='Load a HF CausalLM Model and use it to generate text.')
56
+ parser.add_argument('-n', '--name_or_path', type=str, required=True)
57
+ parser.add_argument(
58
+ '-p',
59
+ '--prompts',
60
+ nargs='+',
61
+ default=[
62
+ 'My name is',
63
+ 'This is an explanation of deep learning to a five year old. Deep learning is',
64
+ ],
65
+ help='Generation prompts. Use syntax "file::/path/to/prompt.txt" to load a ' +\
66
+ 'prompt contained in a txt file.'
67
+ )
68
+ parser.add_argument('--max_seq_len', type=int, default=None)
69
+ parser.add_argument('--max_new_tokens', type=int, default=100)
70
+ parser.add_argument('--max_batch_size', type=int, default=None)
71
+ #####
72
+ # Note: Generation config defaults are set to match Hugging Face defaults
73
+ parser.add_argument('--temperature', type=float, nargs='+', default=[1.0])
74
+ parser.add_argument('--top_k', type=int, nargs='+', default=[50])
75
+ parser.add_argument('--top_p', type=float, nargs='+', default=[1.0])
76
+ parser.add_argument('--repetition_penalty',
77
+ type=float,
78
+ nargs='+',
79
+ default=[1.0])
80
+ parser.add_argument('--no_repeat_ngram_size',
81
+ type=int,
82
+ nargs='+',
83
+ default=[0])
84
+ #####
85
+ parser.add_argument('--seed', type=int, nargs='+', default=[42])
86
+ parser.add_argument('--do_sample',
87
+ type=str2bool,
88
+ nargs='?',
89
+ const=True,
90
+ default=True)
91
+ parser.add_argument('--use_cache',
92
+ type=str2bool,
93
+ nargs='?',
94
+ const=True,
95
+ default=True)
96
+ parser.add_argument('--eos_token_id', type=int, default=None)
97
+ parser.add_argument('--pad_token_id', type=int, default=None)
98
+ parser.add_argument('--model_dtype',
99
+ type=str,
100
+ choices=['fp32', 'fp16', 'bf16'],
101
+ default=None)
102
+ parser.add_argument('--autocast_dtype',
103
+ type=str,
104
+ choices=['fp32', 'fp16', 'bf16'],
105
+ default=None)
106
+ parser.add_argument('--warmup',
107
+ type=str2bool,
108
+ nargs='?',
109
+ const=True,
110
+ default=True)
111
+ parser.add_argument('--trust_remote_code',
112
+ type=str2bool,
113
+ nargs='?',
114
+ const=True,
115
+ default=True)
116
+ parser.add_argument('--use_auth_token',
117
+ type=str_or_bool,
118
+ nargs='?',
119
+ const=True,
120
+ default=None)
121
+ parser.add_argument('--revision', type=str, default=None)
122
+ parser.add_argument('--device', type=str, default=None)
123
+ parser.add_argument('--device_map', type=str, default=None)
124
+ parser.add_argument('--attn_impl', type=str, default=None)
125
+ return parser.parse_args()
126
+
127
+
128
+ def load_prompt_string_from_file(prompt_path_str: str):
129
+ if not prompt_path_str.startswith('file::'):
130
+ raise ValueError('prompt_path_str must start with "file::".')
131
+ _, prompt_file_path = prompt_path_str.split('file::', maxsplit=1)
132
+ prompt_file_path = os.path.expanduser(prompt_file_path)
133
+ if not os.path.isfile(prompt_file_path):
134
+ raise FileNotFoundError(
135
+ f'{prompt_file_path=} does not match any existing files.')
136
+ with open(prompt_file_path, 'r') as f:
137
+ prompt_string = ''.join(f.readlines())
138
+ return prompt_string
139
+
140
+
141
+ def maybe_synchronize():
142
+ if torch.cuda.is_available():
143
+ torch.cuda.synchronize()
144
+
145
+
146
+ def main(args: Namespace) -> None:
147
+ # Set device or device_map
148
+ if args.device and args.device_map:
149
+ raise ValueError('You can only set one of `device` and `device_map`.')
150
+ if args.device is not None:
151
+ device = args.device
152
+ device_map = None
153
+ else:
154
+ device = None
155
+ device_map = args.device_map or 'auto'
156
+ print(f'Using {device=} and {device_map=}')
157
+
158
+ # Set model_dtype
159
+ if args.model_dtype is not None:
160
+ model_dtype = get_dtype(args.model_dtype)
161
+ else:
162
+ model_dtype = torch.float32
163
+ print(f'Using {model_dtype=}')
164
+
165
+ # Load prompts
166
+ prompt_strings = []
167
+ for prompt in args.prompts:
168
+ if prompt.startswith('file::'):
169
+ prompt = load_prompt_string_from_file(prompt)
170
+ prompt_strings.append(prompt)
171
+
172
+ # Grab config first
173
+ print(f'Loading HF Config...')
174
+ from_pretrained_kwargs = {
175
+ 'use_auth_token': args.use_auth_token,
176
+ 'trust_remote_code': args.trust_remote_code,
177
+ 'revision': args.revision,
178
+ }
179
+ try:
180
+ config = AutoConfig.from_pretrained(args.name_or_path,
181
+ **from_pretrained_kwargs)
182
+ if hasattr(config, 'init_device') and device is not None:
183
+ config.init_device = device
184
+ if args.attn_impl is not None and hasattr(config, 'attn_config'):
185
+ config.attn_config['attn_impl'] = args.attn_impl
186
+ if args.max_seq_len is not None and hasattr(config, 'max_seq_len'):
187
+ config.max_seq_len = args.max_seq_len
188
+
189
+ except Exception as e:
190
+ raise RuntimeError(
191
+ 'If you are having auth problems, try logging in via `huggingface-cli login` ' +\
192
+ 'or by setting the environment variable `export HUGGING_FACE_HUB_TOKEN=...` ' +\
193
+ 'using your access token from https://huggingface.co/settings/tokens.'
194
+ ) from e
195
+
196
+ # Build tokenizer
197
+ print('\nLoading HF tokenizer...')
198
+ tokenizer = AutoTokenizer.from_pretrained(args.name_or_path,
199
+ **from_pretrained_kwargs)
200
+ if tokenizer.pad_token_id is None:
201
+ warnings.warn(
202
+ 'pad_token_id is not set for the tokenizer. Using eos_token_id as pad_token_id.'
203
+ )
204
+ tokenizer.pad_token = tokenizer.eos_token
205
+ tokenizer.padding_side = 'left'
206
+
207
+ # Load HF Model
208
+ print(f'Loading HF model with dtype={model_dtype}...')
209
+ try:
210
+ model = AutoModelForCausalLM.from_pretrained(args.name_or_path,
211
+ config=config,
212
+ torch_dtype=model_dtype,
213
+ device_map=device_map,
214
+ **from_pretrained_kwargs)
215
+ model.eval()
216
+ print(f'n_params={sum(p.numel() for p in model.parameters())}')
217
+ if device is not None:
218
+ print(f'Placing model on {device=}...')
219
+ model.to(device)
220
+ except Exception as e:
221
+ raise RuntimeError(
222
+ 'Unable to load HF model. ' +
223
+ 'If you are having auth problems, try logging in via `huggingface-cli login` '
224
+ +
225
+ 'or by setting the environment variable `export HUGGING_FACE_HUB_TOKEN=...` '
226
+ +
227
+ 'using your access token from https://huggingface.co/settings/tokens.'
228
+ ) from e
229
+
230
+ # Autocast
231
+ if args.autocast_dtype is not None:
232
+ autocast_dtype = get_dtype(args.autocast_dtype)
233
+ autocast_context = torch.autocast(model.device.type, autocast_dtype)
234
+ print(f'Using autocast with dtype={autocast_dtype}...')
235
+ else:
236
+ autocast_context = nullcontext()
237
+ print('NOT using autocast...')
238
+
239
+ done_warmup = False
240
+
241
+ for temp, topp, topk, repp, nrnz, seed in itertools.product(
242
+ args.temperature, args.top_p, args.top_k, args.repetition_penalty,
243
+ args.no_repeat_ngram_size, args.seed):
244
+
245
+ # Seed randomness
246
+ random.seed(seed)
247
+ torch.manual_seed(seed)
248
+ print(f'\nGenerate seed:\n{seed}')
249
+
250
+ generate_kwargs = {
251
+ 'max_new_tokens': args.max_new_tokens,
252
+ 'temperature': temp,
253
+ 'top_p': topp,
254
+ 'top_k': topk,
255
+ 'repetition_penalty': repp,
256
+ 'no_repeat_ngram_size': nrnz,
257
+ 'use_cache': args.use_cache,
258
+ 'do_sample': False if temp == 0 else args.do_sample,
259
+ 'eos_token_id': args.eos_token_id or tokenizer.eos_token_id,
260
+ 'pad_token_id': args.pad_token_id or tokenizer.pad_token_id,
261
+ }
262
+ print(f'\nGenerate kwargs:\n{generate_kwargs}')
263
+
264
+ # Generate function with correct context managers
265
+ def _generate(encoded_inp: Dict[str, torch.Tensor]):
266
+ with torch.no_grad():
267
+ with autocast_context:
268
+ return model.generate(
269
+ input_ids=encoded_inp['input_ids'],
270
+ attention_mask=encoded_inp['attention_mask'],
271
+ **generate_kwargs,
272
+ )
273
+
274
+ # Split into prompt batches
275
+ batches = []
276
+ if args.max_batch_size:
277
+ bs = args.max_batch_size
278
+ batches = [
279
+ prompt_strings[i:i + bs]
280
+ for i in range(0, len(prompt_strings), bs)
281
+ ]
282
+
283
+ else:
284
+ batches = [prompt_strings]
285
+
286
+ for batch in batches:
287
+ print(f'\nTokenizing prompts...')
288
+ maybe_synchronize()
289
+ encode_start = time.time()
290
+ encoded_inp = tokenizer(batch, return_tensors='pt', padding=True)
291
+ for key, value in encoded_inp.items():
292
+ encoded_inp[key] = value.to(model.device)
293
+ maybe_synchronize()
294
+ encode_end = time.time()
295
+ input_tokens = torch.sum(
296
+ encoded_inp['input_ids'] !=
297
+ tokenizer.pad_token_id, # type: ignore
298
+ axis=1).numpy(force=True)
299
+
300
+ # Warmup
301
+ if args.warmup and (not done_warmup):
302
+ print('Warming up...')
303
+ _ = _generate(encoded_inp)
304
+ done_warmup = True
305
+
306
+ # Run HF generate
307
+ print('Generating responses...')
308
+ maybe_synchronize()
309
+ gen_start = time.time()
310
+ encoded_gen = _generate(encoded_inp)
311
+ maybe_synchronize()
312
+ gen_end = time.time()
313
+
314
+ decode_start = time.time()
315
+ decoded_gen = tokenizer.batch_decode(encoded_gen,
316
+ skip_special_tokens=True)
317
+ maybe_synchronize()
318
+ decode_end = time.time()
319
+ gen_tokens = torch.sum(encoded_gen != tokenizer.pad_token_id,
320
+ axis=1).numpy(force=True) # type: ignore
321
+
322
+ # Print generations
323
+ delimiter = '#' * 100
324
+ # decode the encoded prompt to handle the case when the tokenizer
325
+ # trims extra spaces or does other pre-tokenization things
326
+ effective_prompts = tokenizer.batch_decode(encoded_inp['input_ids'],
327
+ skip_special_tokens=True)
328
+ for idx, (effective_prompt, prompt, gen) in enumerate(
329
+ zip(effective_prompts, batch, decoded_gen)):
330
+ continuation = gen[len(effective_prompt):]
331
+ print(delimiter)
332
+ if len(continuation) > 0:
333
+ print('\033[92m' + prompt + '\033[0m' + continuation)
334
+ else:
335
+ print('Warning. No non-special output tokens generated.')
336
+ print(
337
+ 'This can happen if the generation only contains padding/eos tokens.'
338
+ )
339
+ print('Debug:')
340
+ full_generation = tokenizer.batch_decode(
341
+ encoded_gen, skip_special_tokens=False)[idx]
342
+ print('\033[92m' + 'Prompt:\n' + prompt + '\033[0m')
343
+ print('Full generation:\n' + full_generation)
344
+
345
+ print(delimiter)
346
+
347
+ # Print timing info
348
+ bs = len(batch)
349
+ # ensure that gen_tokens >= 1 in case model only generated padding tokens
350
+ gen_tokens = np.maximum(gen_tokens, np.ones_like(gen_tokens))
351
+ output_tokens = gen_tokens - input_tokens
352
+ total_input_tokens = input_tokens.sum()
353
+ total_output_tokens = output_tokens.sum()
354
+
355
+ encode_latency = 1000 * (encode_end - encode_start)
356
+ gen_latency = 1000 * (gen_end - gen_start)
357
+ decode_latency = 1000 * (decode_end - decode_start)
358
+ total_latency = encode_latency + gen_latency + decode_latency
359
+
360
+ latency_per_output_token = total_latency / total_output_tokens
361
+ output_tok_per_sec = 1000 / latency_per_output_token
362
+ print(f'{bs=}, {input_tokens=}, {output_tokens=}')
363
+ print(f'{total_input_tokens=}, {total_output_tokens=}')
364
+ print(
365
+ f'{encode_latency=:.2f}ms, {gen_latency=:.2f}ms, {decode_latency=:.2f}ms, {total_latency=:.2f}ms'
366
+ )
367
+ print(f'{latency_per_output_token=:.2f}ms/tok')
368
+ print(f'{output_tok_per_sec=:.2f}tok/sec')
369
+
370
+
371
+ if __name__ == '__main__':
372
+ main(parse_args())
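For reference, the batching and throughput bookkeeping in the script above reduces to a few lines. The prompt list, token counts, and latency below are made-up numbers used only to show the arithmetic, not measured results.

import numpy as np

# Prompts are chunked exactly as above: fixed-size slices when
# --max_batch_size is set, otherwise one batch with everything.
prompt_strings = ['My name is', 'Deep learning is', 'The weather today is'] * 3
max_batch_size = 4
batches = ([prompt_strings[i:i + max_batch_size]
            for i in range(0, len(prompt_strings), max_batch_size)]
           if max_batch_size else [prompt_strings])
print([len(b) for b in batches])  # -> [4, 4, 1]

# Throughput bookkeeping: latency per generated token and tokens/sec,
# computed from hypothetical token counts and a hypothetical latency.
input_tokens = np.array([5, 7, 6, 5])
gen_tokens = np.array([105, 107, 106, 105])  # prompt + generated tokens
total_output_tokens = (gen_tokens - input_tokens).sum()
total_latency_ms = 1850.0
latency_per_output_token = total_latency_ms / total_output_tokens
print(f'{latency_per_output_token:.2f} ms/tok, '
      f'{1000 / latency_per_output_token:.2f} tok/sec')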
Perceptrix/finetune/build/lib/inference/run_mpt_with_ft.py ADDED
@@ -0,0 +1,480 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
5
+ # Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ """Run MPT model with FT.
20
+
21
+ This script is a modified version of
22
+ https://github.com/NVIDIA/FasterTransformer/blob/main/examples/pytorch/gpt/multi_gpu_gpt_example.py
23
+ """
24
+
25
+ import argparse
26
+ import configparser
27
+ import os
28
+ import sys
29
+ import timeit
30
+
31
+ import torch
32
+ from torch.nn.utils.rnn import pad_sequence
33
+ from transformers import AutoTokenizer
34
+
35
+ dir_path = os.path.dirname(os.path.realpath(__file__))
36
+ sys.path.append(os.path.join(dir_path, '../../..'))
37
+ from examples.pytorch.gpt.utils import comm, gpt_decoder
38
+ from examples.pytorch.gpt.utils.parallel_gpt import ParallelGPT
39
+
40
+
41
+ @torch.no_grad()
42
+ def main():
43
+ parser = argparse.ArgumentParser()
44
+ parser.add_argument('--layer_num',
45
+ type=int,
46
+ default=32,
47
+ help='number of layers')
48
+ parser.add_argument('--input_len',
49
+ type=int,
50
+ default=128,
51
+ help='input sequence length to generate.')
52
+ parser.add_argument('--output_len',
53
+ type=int,
54
+ default=64,
55
+ help='output sequence length to generate.')
56
+ parser.add_argument('--head_num', type=int, default=32, help='head number')
57
+ parser.add_argument('--size_per_head',
58
+ type=int,
59
+ default=128,
60
+ help='size per head')
61
+ parser.add_argument('--vocab_size',
62
+ type=int,
63
+ default=50432,
64
+ help='vocab size')
65
+ parser.add_argument(
66
+ '--beam_width',
67
+ type=int,
68
+ default=1,
69
+ help='beam width for beam search. Using sampling when beam width is 1.')
70
+ parser.add_argument('--top_k',
71
+ type=int,
72
+ default=1,
73
+ help='top k candidate num')
74
+ parser.add_argument('--top_p',
75
+ type=float,
76
+ default=0.95,
77
+ help='top p probability threshold')
78
+ parser.add_argument('--temperature',
79
+ type=float,
80
+ default=0.8,
81
+ help='temperature')
82
+ parser.add_argument('--len_penalty',
83
+ type=float,
84
+ default=0.,
85
+ help='len_penalty')
86
+ parser.add_argument('--beam_search_diversity_rate',
87
+ type=float,
88
+ default=0.,
89
+ help='beam_search_diversity_rate')
90
+ parser.add_argument('--tensor_para_size',
91
+ type=int,
92
+ default=1,
93
+ help='tensor parallel size')
94
+ parser.add_argument('--pipeline_para_size',
95
+ type=int,
96
+ default=1,
97
+ help='pipeline parallel size')
98
+ parser.add_argument('--ckpt_path',
99
+ type=str,
100
+ default='mpt-ft-7b/1-gpu',
101
+ help='path to the FT checkpoint file.')
102
+ parser.add_argument(
103
+ '--tokenizer_name_or_path',
104
+ type=str,
105
+ default='EleutherAI/gpt-neox-20b',
106
+ help=
107
+ 'Name of the tokenizer or the directory where the tokenizer file is located.'
108
+ )
109
+ parser.add_argument(
110
+ '--lib_path',
111
+ type=str,
112
+ help=
113
+ 'path to the libth_transformer dynamic lib file (e.g., build/lib/libth_transformer.so).'
114
+ )
115
+ parser.add_argument('--start_id',
116
+ type=int,
117
+ default=0,
118
+ help='start token id.')
119
+ parser.add_argument('--end_id', type=int, default=0, help='end token id.')
120
+ parser.add_argument(
121
+ '--max_batch_size',
122
+ type=int,
123
+ default=8,
124
+ help=
125
+ 'Max batch size. If sample_input_file is given, it is truncated to this max_batch_size; otherwise, this value is used as the batch size.'
126
+ )
127
+ parser.add_argument('--repetition_penalty',
128
+ type=float,
129
+ default=5.,
130
+ help='repetition penalty')
131
+ parser.add_argument(
132
+ '--presence_penalty',
133
+ type=float,
134
+ default=0.,
135
+ help=
136
+ 'presence penalty. Similar to repetition, but additive rather than multiplicative.'
137
+ )
138
+ parser.add_argument('--min_length',
139
+ type=int,
140
+ default=0,
141
+ help='A minimum number of tokens to generate')
142
+ parser.add_argument(
143
+ '--max_seq_len',
144
+ type=int,
145
+ default=2048,
146
+ help='max sequence length for position embedding table.')
147
+ parser.add_argument('--inference_data_type',
148
+ '--data_type',
149
+ type=str,
150
+ choices=['fp32', 'fp16', 'bf16'],
151
+ default='bf16')
152
+ parser.add_argument('--time',
153
+ action='store_true',
154
+ help='whether or not to measure time elapsed.')
155
+ parser.add_argument(
156
+ '--sample_input_file',
157
+ type=str,
158
+ default=None,
159
+ help=
160
+ 'path to sample input file. If not set, it runs with no context inputs.'
161
+ )
162
+ parser.add_argument('--sample_output_file',
163
+ type=str,
164
+ default=None,
165
+ help='path to sample output file.')
166
+ parser.add_argument(
167
+ '--disable_random_seed',
168
+ dest='random_seed',
169
+ action='store_false',
170
+ help='Disable the use of random seed for sentences in a batch.')
171
+ parser.add_argument('--skip_end_tokens',
172
+ dest='skip_end_tokens',
173
+ action='store_false',
174
+ help='Whether or not to remove end tokens from outputs.')
175
+ parser.add_argument('--no_detokenize',
176
+ dest='detokenize',
177
+ action='store_false',
178
+ help='Skip detokenizing output token ids.')
179
+ parser.add_argument(
180
+ '--int8_mode',
181
+ type=int,
182
+ default=0,
183
+ choices=[0, 1],
184
+ help='The level of quantization to perform.' +
185
+ ' 0: No quantization. All computation in data_type' +
186
+ ' 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32'
187
+ )
188
+ parser.add_argument(
189
+ '--weights_data_type',
190
+ type=str,
191
+ default='fp32',
192
+ choices=['fp32', 'fp16'],
193
+ help='Data type of FT checkpoint weights',
194
+ )
195
+ parser.add_argument(
196
+ '--return_cum_log_probs',
197
+ type=int,
198
+ default=0,
199
+ choices=[0, 1, 2],
200
+ help='Whether to compute the cumulative log probability of sentences.' +
201
+ ' 0: do not return the cumulative log probs' +
202
+ ' 1: return the cumulative log probs of generated sequences' +
203
+ ' 2: return the cumulative log probs of sequences')
204
+ parser.add_argument('--shared_contexts_ratio',
205
+ type=float,
206
+ default=0.0,
207
+ help='Triggers the shared context optimization when ' +
208
+ 'compact_size <= shared_contexts_ratio * batch_size. ' +
209
+ 'A value of 0.0 deactivates the optimization.')
210
+ parser.add_argument(
211
+ '--use_gpt_decoder_ops',
212
+ action='store_true',
213
+ help='Use separate decoder FT operators instead of end-to-end model op.'
214
+ )
215
+ parser.add_argument(
216
+ '--no-alibi',
217
+ dest='alibi',
218
+ action='store_false',
219
+ help='Do not use ALiBi (aka use_attention_linear_bias).')
220
+ parser.add_argument(
221
+ '--layernorm_eps',
222
+ type=float,
223
+ default=1e-5,
224
+ help='layernorm eps in PyTorch, by default, is 1e-5 and 1e-6 in FT.')
225
+ args = parser.parse_args()
226
+
227
+ ckpt_config = configparser.ConfigParser()
228
+ ckpt_config_path = os.path.join(args.ckpt_path, 'config.ini')
229
+ if os.path.isfile(ckpt_config_path):
230
+ ckpt_config.read(ckpt_config_path)
231
+ if 'gpt' in ckpt_config.keys():
232
+ for args_key, config_key, func in [
233
+ ('layer_num', 'num_layer', ckpt_config.getint),
234
+ ('max_seq_len', 'max_pos_seq_len', ckpt_config.getint),
235
+ ('weights_data_type', 'weight_data_type', ckpt_config.get),
236
+ ('layernorm_eps', 'layernorm_eps', ckpt_config.getfloat),
237
+ ('alibi', 'use_attention_linear_bias', ckpt_config.getboolean),
238
+ ]:
239
+ if config_key in ckpt_config['gpt'].keys():
240
+ prev_val = args.__dict__[args_key]
241
+ args.__dict__[args_key] = func('gpt', config_key)
242
+ print(
243
+ 'Loading {} from config.ini, previous: {}, current: {}'
244
+ .format(args_key, prev_val, args.__dict__[args_key]))
245
+ else:
246
+ print('Not loading {} from config.ini'.format(args_key))
247
+ for key in ['head_num', 'size_per_head', 'tensor_para_size']:
248
+ if key in args.__dict__:
249
+ prev_val = args.__dict__[key]
250
+ args.__dict__[key] = ckpt_config.getint('gpt', key)
251
+ print(
252
+ 'Loading {} from config.ini, previous: {}, current: {}'
253
+ .format(key, prev_val, args.__dict__[key]))
254
+ else:
255
+ print('Not loading {} from config.ini'.format(key))
256
+
257
+ layer_num = args.layer_num
258
+ output_len = args.output_len
259
+ head_num = args.head_num
260
+ size_per_head = args.size_per_head
261
+ vocab_size = args.vocab_size
262
+ beam_width = args.beam_width
263
+ top_k = args.top_k
264
+ top_p = args.top_p
265
+ temperature = args.temperature
266
+ len_penalty = args.len_penalty
267
+ beam_search_diversity_rate = args.beam_search_diversity_rate
268
+ tensor_para_size = args.tensor_para_size
269
+ pipeline_para_size = args.pipeline_para_size
270
+ start_id = args.start_id
271
+ end_id = args.end_id
272
+ max_batch_size = args.max_batch_size
273
+ max_seq_len = args.max_seq_len
274
+ repetition_penalty = args.repetition_penalty
275
+ presence_penalty = args.presence_penalty
276
+ min_length = args.min_length
277
+ weights_data_type = args.weights_data_type
278
+ return_cum_log_probs = args.return_cum_log_probs
279
+ return_output_length = return_cum_log_probs > 0
280
+ shared_contexts_ratio = args.shared_contexts_ratio
281
+ layernorm_eps = args.layernorm_eps
282
+ use_attention_linear_bias = args.alibi
283
+ has_positional_encoding = not args.alibi
284
+
285
+ print('\n=================== Arguments ===================')
286
+ for k, v in vars(args).items():
287
+ print(f'{k.ljust(30, ".")}: {v}')
288
+ print('=================================================\n')
289
+
290
+ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path)
291
+ torch.manual_seed(0)
292
+
293
+ comm.initialize_model_parallel(args.tensor_para_size,
294
+ args.pipeline_para_size)
295
+ rank = comm.get_rank()
296
+ device = comm.get_device()
297
+
298
+ # Inputs
299
+ contexts = []
300
+ if args.sample_input_file:
301
+ with open(args.sample_input_file, 'r') as f:
302
+ contexts = f.read().splitlines()
303
+ batch_size = min(len(contexts), max_batch_size)
304
+ contexts = contexts[:batch_size]
305
+ start_ids = [
306
+ torch.tensor(tokenizer.encode(c), dtype=torch.int32, device=device)
307
+ for c in contexts
308
+ ]
309
+ else:
310
+ batch_size = max_batch_size
311
+ contexts = ['<|endoftext|>'] * batch_size
312
+ start_ids = [torch.IntTensor([end_id for _ in range(args.input_len)])
313
+ ] * batch_size
314
+
315
+ start_lengths = [len(ids) for ids in start_ids]
316
+
317
+ start_ids = pad_sequence(start_ids, batch_first=True, padding_value=end_id)
318
+ start_lengths = torch.IntTensor(start_lengths)
319
+
320
+ # Prepare model.
321
+ if not args.use_gpt_decoder_ops:
322
+ gpt = ParallelGPT(head_num,
323
+ size_per_head,
324
+ vocab_size,
325
+ start_id,
326
+ end_id,
327
+ layer_num,
328
+ max_seq_len,
329
+ tensor_para_size,
330
+ pipeline_para_size,
331
+ lib_path=args.lib_path,
332
+ inference_data_type=args.inference_data_type,
333
+ int8_mode=args.int8_mode,
334
+ weights_data_type=weights_data_type,
335
+ layernorm_eps=layernorm_eps,
336
+ use_attention_linear_bias=use_attention_linear_bias,
337
+ has_positional_encoding=has_positional_encoding,
338
+ shared_contexts_ratio=shared_contexts_ratio)
339
+ if not gpt.load(ckpt_path=args.ckpt_path):
340
+ print(
341
+ '[WARNING] Checkpoint file not found. Model loading is skipped.'
342
+ )
343
+ else:
344
+ gpt = gpt_decoder.Gpt(num_heads=head_num,
345
+ size_per_head=size_per_head,
346
+ num_layers=layer_num,
347
+ vocab_size=vocab_size,
348
+ start_id=start_id,
349
+ end_id=end_id,
350
+ tensor_para_size=tensor_para_size,
351
+ pipeline_para_size=pipeline_para_size,
352
+ lib_path=args.lib_path,
353
+ max_seq_len=max_seq_len,
354
+ int8_mode=args.int8_mode,
355
+ weights_data_type=args.weights_data_type)
356
+ gpt.load(args.ckpt_path, args.inference_data_type)
357
+
358
+ if args.random_seed:
359
+ random_seed_tensor = torch.randint(0,
360
+ 10000,
361
+ size=[batch_size],
362
+ dtype=torch.int64)
363
+ else:
364
+ random_seed_tensor = torch.zeros([batch_size], dtype=torch.int64)
365
+
366
+ repetition_penalty_vec = None if repetition_penalty == 1. else repetition_penalty * torch.ones(
367
+ batch_size, dtype=torch.float32)
368
+ presence_penalty_vec = None if presence_penalty == 0. else presence_penalty * torch.ones(
369
+ batch_size, dtype=torch.float32)
370
+
371
+ infer_decode_args = {
372
+ 'beam_width':
373
+ beam_width,
374
+ 'top_k':
375
+ top_k * torch.ones(batch_size, dtype=torch.int32),
376
+ 'top_p':
377
+ top_p * torch.ones(batch_size, dtype=torch.float32),
378
+ 'temperature':
379
+ temperature * torch.ones(batch_size, dtype=torch.float32),
380
+ 'repetition_penalty':
381
+ repetition_penalty_vec,
382
+ 'presence_penalty':
383
+ presence_penalty_vec,
384
+ 'beam_search_diversity_rate':
385
+ beam_search_diversity_rate *
386
+ torch.ones(batch_size, dtype=torch.float32),
387
+ 'len_penalty':
388
+ len_penalty * torch.ones(size=[batch_size], dtype=torch.float32),
389
+ 'bad_words_list':
390
+ None,
391
+ 'min_length':
392
+ min_length * torch.ones(size=[batch_size], dtype=torch.int32),
393
+ 'random_seed':
394
+ random_seed_tensor
395
+ }
396
+
397
+ if not args.use_gpt_decoder_ops:
398
+
399
+ def gpt_generate_fn():
400
+ tokens_batch = gpt(start_ids,
401
+ start_lengths,
402
+ output_len,
403
+ return_output_length=return_output_length,
404
+ return_cum_log_probs=return_cum_log_probs,
405
+ **infer_decode_args)
406
+ return tokens_batch
407
+ else:
408
+
409
+ def gpt_generate_fn():
410
+ output_dict = gpt.generate(
411
+ input_token_ids=start_ids,
412
+ input_lengths=start_lengths,
413
+ gen_length=output_len,
414
+ eos_token_id=end_id,
415
+ return_output_length=return_output_length,
416
+ return_log_probs=return_cum_log_probs,
417
+ **infer_decode_args)
418
+ return output_dict
419
+
420
+ # Generate tokens.
421
+ gen_outputs = gpt_generate_fn()
422
+
423
+ if rank == 0:
424
+ if not args.use_gpt_decoder_ops:
425
+ if return_cum_log_probs > 0:
426
+ tokens_batch, _, cum_log_probs = gen_outputs
427
+ else:
428
+ tokens_batch, cum_log_probs = gen_outputs, None
429
+ else:
430
+ tokens_batch = gen_outputs['output_token_ids']
431
+ cum_log_probs = gen_outputs[
432
+ 'cum_log_probs'] if return_cum_log_probs > 0 else None
433
+ if cum_log_probs is not None:
434
+ print('[INFO] Log probs of sentences:', cum_log_probs)
435
+
436
+ outputs = []
437
+ tokens_batch = tokens_batch.cpu().numpy()
438
+ for i, (context, tokens) in enumerate(zip(contexts, tokens_batch)):
439
+ for beam_id in range(beam_width):
440
+ token = tokens[beam_id][
441
+ start_lengths[i]:] # exclude context input from the output
442
+ if args.skip_end_tokens:
443
+ token = token[token != end_id]
444
+ output = tokenizer.decode(
445
+ token) if args.detokenize else ' '.join(
446
+ str(t) for t in token.tolist())
447
+ outputs.append(output)
448
+ print(
449
+ f'[INFO] batch {i}, beam {beam_id}:\n[Context]\n{context}\n\n[Output]\n{output}\n'
450
+ )
451
+
452
+ if args.sample_output_file:
453
+ with open(args.sample_output_file, 'w+') as f:
454
+ outputs = [o.replace('\n', '\\n') for o in outputs]
455
+ f.writelines('\n'.join(outputs))
456
+
457
+ # Measure inference time.
458
+ if args.time:
459
+ warmup_iterations = 10
460
+ for _ in range(warmup_iterations):
461
+ gpt_generate_fn()
462
+ torch.cuda.synchronize()
463
+ measurement_iterations = 10
464
+ time = timeit.default_timer()
465
+ for _ in range(measurement_iterations):
466
+ gpt_generate_fn()
467
+ torch.cuda.synchronize()
468
+ time_elapsed = timeit.default_timer() - time
469
+ if rank == 0:
470
+ print(f'[INFO] MPT time costs:')
471
+ print(
472
+ 'model_name, gpu_type, gpu_count, batch_size, input_tokens, output_tokens, latency_ms'
473
+ )
474
+ print(
475
+ f'{ckpt_config.get("gpt", "model_name")}, {torch.cuda.get_device_name().replace(" ", "-")}, {torch.cuda.device_count()}, {batch_size}, {args.input_len}, {args.output_len}, {time_elapsed * 1000 / measurement_iterations:.2f}'
476
+ )
477
+
478
+
479
+ if __name__ == '__main__':
480
+ main()
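The input preparation above can be seen in isolation: variable-length token id tensors are right-padded into one batch with pad_sequence, and the sampling knobs are expanded into per-sentence 1-D tensors as FasterTransformer expects. The ids and settings below are toy values, not anything read from a real checkpoint.

import torch
from torch.nn.utils.rnn import pad_sequence

end_id = 0  # the EOS/pad id the script reads from --end_id
# Variable-length contexts, as produced by tokenizer.encode per prompt.
start_ids = [torch.tensor([12, 54, 7], dtype=torch.int32),
             torch.tensor([9, 2], dtype=torch.int32)]
start_lengths = torch.IntTensor([len(ids) for ids in start_ids])

# Right-pad into one rectangular batch, exactly as the script does.
start_ids = pad_sequence(start_ids, batch_first=True, padding_value=end_id)
print(start_ids)       # [[12, 54, 7], [9, 2, 0]]
print(start_lengths)   # [3, 2]

# FasterTransformer takes per-sentence sampling parameters as 1-D tensors.
batch_size = start_ids.shape[0]
top_k = 1 * torch.ones(batch_size, dtype=torch.int32)
top_p = 0.95 * torch.ones(batch_size, dtype=torch.float32)
temperature = 0.8 * torch.ones(batch_size, dtype=torch.float32)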
Perceptrix/finetune/build/lib/llmfoundry/__init__.py ADDED
@@ -0,0 +1,71 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import torch
5
+
6
+ try:
7
+ from llmfoundry import optim, utils
8
+ from llmfoundry.data import (ConcatTokensDataset,
9
+ MixtureOfDenoisersCollator, NoConcatDataset,
10
+ Seq2SeqFinetuningCollator,
11
+ build_finetuning_dataloader,
12
+ build_text_denoising_dataloader)
13
+ from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM,
14
+ ComposerHFT5)
15
+ from llmfoundry.models.layers.attention import (
16
+ MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
17
+ flash_attn_fn, scaled_multihead_dot_product_attention,
18
+ triton_flash_attn_fn)
19
+ from llmfoundry.models.layers.blocks import MPTBlock
20
+ from llmfoundry.models.layers.ffn import (FFN_CLASS_REGISTRY, MPTMLP,
21
+ build_ffn)
22
+ from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
23
+ from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig,
24
+ MPTForCausalLM, MPTModel,
25
+ MPTPreTrainedModel)
26
+ from llmfoundry.tokenizers import TiktokenTokenizerWrapper
27
+
28
+ except ImportError as e:
29
+ try:
30
+ is_cuda_available = torch.cuda.is_available()
31
+ except:
32
+ is_cuda_available = False
33
+
34
+ extras = '.[gpu]' if is_cuda_available else '.'
35
+ raise ImportError(
36
+ f'Please make sure to pip install {extras} to get the requirements for the LLM example.'
37
+ ) from e
38
+
39
+ __all__ = [
40
+ 'build_text_denoising_dataloader',
41
+ 'build_finetuning_dataloader',
42
+ 'MixtureOfDenoisersCollator',
43
+ 'Seq2SeqFinetuningCollator',
44
+ 'MPTBlock',
45
+ 'FFN_CLASS_REGISTRY',
46
+ 'MPTMLP',
47
+ 'build_ffn',
48
+ 'MPTConfig',
49
+ 'MPTPreTrainedModel',
50
+ 'MPTModel',
51
+ 'MPTForCausalLM',
52
+ 'ComposerMPTCausalLM',
53
+ 'ComposerHFCausalLM',
54
+ 'ComposerHFPrefixLM',
55
+ 'ComposerHFT5',
56
+ 'COMPOSER_MODEL_REGISTRY',
57
+ 'scaled_multihead_dot_product_attention',
58
+ 'flash_attn_fn',
59
+ 'triton_flash_attn_fn',
60
+ 'MultiheadAttention',
61
+ 'NoConcatDataset',
62
+ 'ConcatTokensDataset',
63
+ 'attn_bias_shape',
64
+ 'build_attn_bias',
65
+ 'build_alibi_bias',
66
+ 'optim',
67
+ 'utils',
68
+ 'TiktokenTokenizerWrapper',
69
+ ]
70
+
71
+ __version__ = '0.3.0'
Perceptrix/finetune/build/lib/llmfoundry/callbacks/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ try:
5
+ from llmfoundry.callbacks.eval_gauntlet_callback import EvalGauntlet
6
+ from llmfoundry.callbacks.fdiff_callback import FDiffMetrics
7
+ from llmfoundry.callbacks.generate_callback import Generate
8
+ from llmfoundry.callbacks.hf_checkpointer import HuggingFaceCheckpointer
9
+ from llmfoundry.callbacks.model_gauntlet_callback import ModelGauntlet
10
+ from llmfoundry.callbacks.monolithic_ckpt_callback import \
11
+ MonolithicCheckpointSaver
12
+ from llmfoundry.callbacks.resumption_callbacks import (GlobalLRScaling,
13
+ LayerFreezing)
14
+ from llmfoundry.callbacks.scheduled_gc_callback import \
15
+ ScheduledGarbageCollector
16
+ except ImportError as e:
17
+ raise ImportError(
18
+ 'Please make sure to pip install . to get requirements for llm-foundry.'
19
+ ) from e
20
+
21
+ __all__ = [
22
+ 'FDiffMetrics',
23
+ 'Generate',
24
+ 'MonolithicCheckpointSaver',
25
+ 'GlobalLRScaling',
26
+ 'LayerFreezing',
27
+ 'ScheduledGarbageCollector',
28
+ 'EvalGauntlet',
29
+ 'ModelGauntlet',
30
+ 'HuggingFaceCheckpointer',
31
+ ]
Perceptrix/finetune/build/lib/llmfoundry/callbacks/eval_gauntlet_callback.py ADDED
@@ -0,0 +1,177 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Aggregate ICL evals into composite scores."""
5
+
6
+ import logging
7
+ import math
8
+ from enum import Enum
9
+ from typing import Dict, Optional
10
+
11
+ from composer.core import Callback, State
12
+ from composer.loggers import Logger
13
+
14
+ __all__ = ['EvalGauntlet']
15
+
16
+ log = logging.getLogger(__name__)
17
+
18
+
19
+ class Weighting(Enum):
20
+ EQUAL = 1
21
+ SAMPLE_SZ = 2
22
+ LOG_SAMPLE_SZ = 3
23
+
24
+
25
+ class EvalGauntlet(Callback):
26
+ """The EvalGauntlet aggregates ICL eval results.
27
+
28
+ After `eval_end`, this callback inspects the logger for different ICL metrics and aggregates the scores according to the aggregation
29
+ specification provided in the constructor.
30
+
31
+ Args:
32
+ logger_keys (list): These are the exact keys that the individual benchmark metrics will be
33
+ logged under in the logger after eval
34
+ categories (dict): This contains the list of categories, as well as the subtasks within them, the
35
+ random baseline accuracy of each subtask, and the number of fewshot examples
36
+ used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure.
37
+ weighting (Weighting): The weighting scheme used to balance different tasks within each category.
38
+ Either assign them all equal weight, assign them weight proportional
39
+ to the dataset size, or assign them weight proportional to the log2 of the dataset size.
40
+ Options are 'EQUAL', 'SAMPLE_SZ', and 'LOG_SAMPLE_SZ'.
41
+ subtract_random_baseline (bool): Flag determining whether to subtract random baseline accuracy
42
+ from the performance on each individual benchmark before aggregating.
43
+ rescale_accuracy (bool): Flag determining whether to rescale the accuracy on each benchmark
44
+ by (1-random_baseline_accuracy) before aggregating. Using this ensures that all benchmarks max out at 1.0.
45
+ benchmark_sizes (Optional[dict]): Optional data on benchmark sizes, used when not relying on equal weighting.
46
+ """
47
+
48
+ def __init__(self,
49
+ logger_keys: list,
50
+ categories: dict,
51
+ weighting: str = 'EQUAL',
52
+ subtract_random_baseline: bool = True,
53
+ rescale_accuracy: bool = True,
54
+ benchmark_sizes: Optional[dict] = None):
55
+ if isinstance(logger_keys, dict):
56
+ raise ValueError(
57
+ 'logger_keys now requires a list type as input, not a dict')
58
+ if weighting != Weighting.EQUAL and benchmark_sizes is None:
59
+ raise Exception(
60
+ 'When not using equal weighting, you must provide the benchmark sizes.'
61
+ )
62
+
63
+ if rescale_accuracy and not subtract_random_baseline:
64
+ raise Exception(
65
+ 'Only use accuracy rescaling in conjunction with subtracting random baseline accuracy.'
66
+ )
67
+
68
+ self.categories = categories
69
+ self.weighting = Weighting[weighting]
70
+ self.subtract_random_baseline = subtract_random_baseline
71
+ self.rescale_accuracy = rescale_accuracy
72
+ self.logger_keys = logger_keys
73
+
74
+ for category in self.categories:
75
+
76
+ for benchmark in category['benchmarks']:
77
+ bench_name = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
78
+
79
+ if self.weighting != Weighting.EQUAL:
80
+ assert benchmark_sizes is not None
81
+ cumulative_samples = max(
82
+ sum(count for name, count in benchmark_sizes.items()
83
+ if name.startswith(bench_name)), 1)
84
+ else:
85
+ cumulative_samples = -1 # pyright
86
+
87
+ weight = None
88
+ if self.weighting == Weighting.EQUAL:
89
+ weight = 1
90
+ elif self.weighting == Weighting.SAMPLE_SZ:
91
+ weight = cumulative_samples
92
+ elif self.weighting == Weighting.LOG_SAMPLE_SZ:
93
+ weight = max(math.log(cumulative_samples, 2), 1)
94
+
95
+ assert weight is not None
96
+ benchmark['weighting'] = weight
97
+
98
+ def compute_averages(self, state: State) -> Dict[str, float]:
99
+ results = {}
100
+
101
+ for key in self.logger_keys:
102
+
103
+ # starting at index 1 skips the "metric" part of the key which is superfluous
104
+ dl_name, metric_name = key.split('/')[1:-1], key.split('/')[-1]
105
+ if 'Accuracy' not in metric_name:
106
+ continue
107
+
108
+ metric = state.eval_metrics.get('/'.join(dl_name),
109
+ {}).get(metric_name, None)
110
+ if metric is None:
111
+ continue
112
+ val = metric.compute().item()
113
+
114
+ # ending at index 2 allows us to aggregate over dataloaders w/ subcategories
115
+ key = '/'.join(dl_name[0:2])
116
+ if key not in results:
117
+ results[key] = []
118
+
119
+ results[key].append(val)
120
+
121
+ return {k: sum(v) / len(v) for k, v in results.items()}
122
+
123
+ def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
124
+ new_metrics = self.compute_averages(state)
125
+ if len(new_metrics) == 0:
126
+ return {}
127
+ composite_scores = {}
128
+
129
+ for category in self.categories:
130
+ missing_metrics = []
131
+ composite_scores[category['name']] = []
132
+ for benchmark in category['benchmarks']:
133
+ key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
134
+
135
+ if key not in new_metrics:
136
+ log.warning(
137
+ f'Could not find results for benchmark: {benchmark}.')
138
+ missing_metrics.append(key)
139
+ else:
140
+ score = new_metrics[key]
141
+
142
+ if self.subtract_random_baseline:
143
+ score -= benchmark['random_baseline']
144
+
145
+ if self.rescale_accuracy and self.subtract_random_baseline:
146
+ score /= 1.0 - benchmark['random_baseline']
147
+
148
+ composite_scores[category['name']].append({
149
+ 'name': benchmark['name'],
150
+ 'score': score,
151
+ 'weighting': benchmark['weighting']
152
+ })
153
+
154
+ if len(missing_metrics) > 0:
155
+ log.warning(
156
+ f"Removing category `{category['name']}` from scores because benchmarks were missing: {missing_metrics}"
157
+ )
158
+ del composite_scores[category['name']]
159
+ continue
160
+ total_weight = sum(
161
+ k['weighting'] for k in composite_scores[category['name']])
162
+ composite_scores[category['name']] = sum(
163
+ k['score'] * (k['weighting'] / total_weight)
164
+ for k in composite_scores[category['name']])
165
+
166
+ composite_scores = {
167
+ f'icl/metrics/eval_gauntlet/{k}': v
168
+ for k, v in composite_scores.items()
169
+ }
170
+
171
+ composite_scores['icl/metrics/eval_gauntlet/average'] = sum(
172
+ composite_scores.values()) / len(composite_scores.values()) if len(
173
+ composite_scores.values()) > 0 else 0
174
+ if logger is not None:
175
+ logger.log_metrics(composite_scores)
176
+
177
+ return composite_scores
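A standalone sketch of the aggregation performed in eval_after_all: subtract the random baseline, rescale so a perfect benchmark scores 1.0, then take a weighted average (LOG_SAMPLE_SZ-style weights here). The two benchmarks and all of their numbers are invented purely for illustration.

import math

# Two invented benchmarks in one category, with LOG_SAMPLE_SZ-style weights.
benchmarks = [
    {'name': 'task_a', 'accuracy': 0.55, 'random_baseline': 0.25, 'samples': 1024},
    {'name': 'task_b', 'accuracy': 0.80, 'random_baseline': 0.50, 'samples': 256},
]
scored = []
for b in benchmarks:
    # Subtract the random baseline, then rescale so a perfect score is 1.0.
    score = (b['accuracy'] - b['random_baseline']) / (1.0 - b['random_baseline'])
    weight = max(math.log2(b['samples']), 1)
    scored.append({'score': score, 'weighting': weight})

# Weighted average, matching the per-category aggregation above.
total_weight = sum(s['weighting'] for s in scored)
composite = sum(s['score'] * (s['weighting'] / total_weight) for s in scored)
print(f'{composite:.4f}')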
Perceptrix/finetune/build/lib/llmfoundry/callbacks/fdiff_callback.py ADDED
@@ -0,0 +1,67 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Monitor rate of change of loss."""
5
+ from __future__ import annotations
6
+
7
+ import torch
8
+ from composer.core import Callback, State
9
+ from composer.loggers import Logger
10
+
11
+
12
+ class FDiffMetrics(Callback):
13
+ """Rate of change of metrics.
14
+
15
+ Tracks and plots the rate of change of metrics, effectively taking the
16
+ numerical derivative of the metrics.
17
+ """
18
+
19
+ def __init__(self,
20
+ diff_train_metrics: bool = False,
21
+ diff_eval_metrics: bool = True):
22
+ self.diff_train_metrics = diff_train_metrics
23
+ self.diff_eval_metrics = diff_eval_metrics
24
+
25
+ self.train_prev_loss = None
26
+ self.train_prev_metric = {}
27
+ self.eval_prev_metric = {}
28
+
29
+ def batch_end(self, state: State, logger: Logger) -> None:
30
+ if self.diff_train_metrics:
31
+ if not isinstance(state.loss, torch.Tensor):
32
+ raise NotImplementedError('Multiple losses not supported yet')
33
+ loss = state.loss.item()
34
+ if self.train_prev_loss:
35
+ logger.log_metrics(
36
+ {'loss/train/total_fdiff': loss - self.train_prev_loss})
37
+ self.train_prev_loss = loss
38
+
39
+ for k in self.train_prev_metric.keys():
40
+ logger.log_metrics({
41
+ f'metrics/train/{k}_fdiff':
42
+ state.train_metric_values[k] - self.train_prev_metric[k]
43
+ })
44
+
45
+ for k in state.train_metric_values.keys():
46
+ value = state.train_metric_values[k]
47
+ self.train_prev_metric[k] = value
48
+
49
+ def eval_end(self, state: State, logger: Logger) -> None:
50
+ if self.diff_eval_metrics:
51
+ evaluator = state.dataloader_label
52
+ assert evaluator is not None, 'dataloader should have been set'
53
+
54
+ metrics = list(state.eval_metrics[evaluator].keys())
55
+
56
+ for k in metrics:
57
+ mkey = '/'.join(['metrics', evaluator, k])
58
+ if mkey in self.eval_prev_metric.keys():
59
+ logger.log_metrics({
60
+ f'{mkey}_fdiff':
61
+ state.eval_metric_values[k] -
62
+ self.eval_prev_metric[mkey]
63
+ })
64
+
65
+ for k in metrics:
66
+ mkey = '/'.join(['metrics', evaluator, k])
67
+ self.eval_prev_metric[mkey] = state.eval_metric_values[k]
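The quantity this callback logs is just a first difference between consecutive metric values; with a toy loss curve (the numbers are made up):

# A logged metric and its first differences, which is all the callback emits.
losses = [2.31, 2.10, 1.98, 1.93]
fdiffs = [round(curr - prev, 4) for prev, curr in zip(losses, losses[1:])]
print(fdiffs)  # [-0.21, -0.12, -0.05]: negative values mean the loss is falling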
Perceptrix/finetune/build/lib/llmfoundry/callbacks/generate_callback.py ADDED
@@ -0,0 +1,30 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Deprecated Generate callback.
5
+
6
+ Please use composer.callbacks.Generate instead.
7
+ """
8
+ import warnings
9
+ from typing import Any, List, Union
10
+
11
+ from composer.callbacks import Generate as ComposerGenerate
12
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
13
+
14
+ Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
15
+
16
+
17
+ class Generate(ComposerGenerate):
18
+
19
+ def __init__(self, prompts: List[str], batch_log_interval: int,
20
+ **kwargs: Any):
21
+
22
+ warnings.warn(
23
+ ('Accessing llmfoundry.callbacks.generate_callback.Generate '
24
+ 'is deprecated and will be removed in a future release. '
25
+ 'Please use composer.callbacks.Generate instead.'),
26
+ DeprecationWarning,
27
+ )
28
+
29
+ interval = f'{batch_log_interval}ba'
30
+ super().__init__(prompts=prompts, interval=interval, **kwargs)
Perceptrix/finetune/build/lib/llmfoundry/callbacks/hf_checkpointer.py ADDED
@@ -0,0 +1,167 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import contextlib
5
+ import json
6
+ import logging
7
+ import os
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import Optional, Union
11
+
12
+ import torch
13
+ from composer.callbacks.utils import create_interval_scheduler
14
+ from composer.core import Callback, Event, State, Time
15
+ from composer.core.state import fsdp_state_dict_type_context
16
+ from composer.loggers import Logger
17
+ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
18
+ from composer.models import HuggingFaceModel
19
+ from composer.utils import dist, format_name_with_dist_and_time, parse_uri
20
+ from transformers import PreTrainedTokenizerBase
21
+
22
+ from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM
23
+ from llmfoundry.utils.huggingface_hub_utils import \
24
+ edit_files_for_hf_compatibility
25
+
26
+ log = logging.getLogger(__name__)
27
+
28
+
29
+ class HuggingFaceCheckpointer(Callback):
30
+ """Save a huggingface formatted checkpoint during training.
31
+
32
+ Args:
33
+ save_folder (str): Top level folder to save checkpoints to (can be a URI). It is likely that
34
+ this would be the same as your save_folder.
35
+ save_interval: Union[str, int, Time]: The interval describing how often checkpoints should be
36
+ saved. If an integer, it will be assumed to be in :attr:`.TimeUnit.EPOCH`.
37
+ Otherwise, the unit must be either :attr:`.TimeUnit.EPOCH`, :attr:`.TimeUnit.BATCH`,
38
+ :attr:`.TimeUnit.TOKEN`, or :attr:`.TimeUnit.SAMPLE`.
39
+ huggingface_folder_name (str): Folder to save each checkpoint under (can be a format string). Default is ``ba{batch}``.
40
+ precision: The precision to save the model in. Default is ``float32``. Options are ``bfloat16``, ``float16``, or ``float32``.
41
+ overwrite (bool): Whether to overwrite previous checkpoints.
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ save_folder: str,
47
+ save_interval: Union[str, int, Time],
48
+ huggingface_folder_name: str = 'ba{batch}',
49
+ precision: str = 'float32',
50
+ overwrite: bool = False,
51
+ ):
52
+ self.backend, self.bucket_name, self.save_dir_format_str = parse_uri(
53
+ save_folder)
54
+ self.overwrite = overwrite
55
+ self.precision = precision
56
+ self.dtype = {
57
+ 'float32': torch.float32,
58
+ 'float16': torch.float16,
59
+ 'bfloat16': torch.bfloat16,
60
+ }[precision]
61
+ self.huggingface_folder_name_fstr = os.path.join(
62
+ 'huggingface', huggingface_folder_name)
63
+ self.check_interval = create_interval_scheduler(
64
+ save_interval, include_end_of_training=True)
65
+ self.upload_to_object_store = (self.backend != '')
66
+ if self.upload_to_object_store:
67
+ self.remote_ud = RemoteUploaderDownloader(
68
+ bucket_uri=f'{self.backend}://{self.bucket_name}',
69
+ num_concurrent_uploads=4)
70
+ else:
71
+ self.remote_ud = None
72
+
73
+ self.last_checkpoint_batch: Optional[Time] = None
74
+
75
+ def run_event(self, event: Event, state: State, logger: Logger) -> None:
76
+ # The interval scheduler handles only returning True for the appropriate events
77
+ if state.get_elapsed_duration() is not None and self.check_interval(
78
+ state,
79
+ event) and self.last_checkpoint_batch != state.timestamp.batch:
80
+ self._save_checkpoint(state, logger)
81
+ elif event == Event.INIT:
82
+ if not isinstance(state.model, HuggingFaceModel):
83
+ raise ValueError(
84
+ f'`HuggingFaceCheckpointer` is only compatible with `HuggingFaceModel`s. '
85
+ + f'Got {type(state.model)} instead.')
86
+ if self.upload_to_object_store and self.remote_ud is not None:
87
+ self.remote_ud.init(state, logger)
88
+ state.callbacks.append(self.remote_ud)
89
+
90
+ def _save_checkpoint(self, state: State, logger: Logger):
91
+ del logger # unused
92
+
93
+ self.last_checkpoint_batch = state.timestamp.batch
94
+
95
+ log.info('Saving HuggingFace formatted checkpoint')
96
+
97
+ from transformers.models.auto.configuration_auto import CONFIG_MAPPING
98
+ CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
99
+ MPTConfig.register_for_auto_class()
100
+ MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
101
+
102
+ assert isinstance(state.model, HuggingFaceModel)
103
+
104
+ save_dir = format_name_with_dist_and_time(
105
+ str(
106
+ Path(self.save_dir_format_str) /
107
+ self.huggingface_folder_name_fstr), state.run_name,
108
+ state.timestamp)
109
+ dir_context_mgr = tempfile.TemporaryDirectory(
110
+ ) if self.upload_to_object_store else contextlib.nullcontext(
111
+ enter_result=save_dir)
112
+
113
+ with dir_context_mgr as temp_save_dir:
114
+ assert isinstance(temp_save_dir,
115
+ str) # pyright doesn't know about enter_result
116
+
117
+ with fsdp_state_dict_type_context(state.model.model,
118
+ state_dict_type='full'):
119
+ state_dict = state.model.model.state_dict()
120
+
121
+ # convert the state dict to the requested precision
122
+ for k, v in state_dict.items():
123
+ if isinstance(v, torch.Tensor):
124
+ state_dict[k] = v.to(dtype=self.dtype)
125
+
126
+ if dist.get_global_rank() == 0:
127
+ # We raise above if the model is not a HuggingFaceModel, so this assert is safe
128
+ assert hasattr(state.model.model, 'save_pretrained')
129
+ state.model.model.save_pretrained(temp_save_dir,
130
+ state_dict=state_dict)
131
+
132
+ if state.model.tokenizer is not None:
133
+ assert isinstance(state.model.tokenizer,
134
+ PreTrainedTokenizerBase)
135
+ state.model.tokenizer.save_pretrained(temp_save_dir)
136
+
137
+ # Only need to edit files for MPT because it has custom code
138
+ if state.model.model.config.model_type == 'mpt':
139
+ edit_files_for_hf_compatibility(temp_save_dir)
140
+
141
+ with open(os.path.join(temp_save_dir, 'config.json'), 'r') as f:
142
+ edited_config = json.load(f)
143
+
144
+ if state.model.model.config.model_type == 'mpt':
145
+ edited_config['attn_config']['attn_impl'] = 'torch'
146
+ edited_config['init_device'] = 'cpu'
147
+
148
+ edited_config['torch_dtype'] = self.precision
149
+ with open(os.path.join(temp_save_dir, 'config.json'), 'w') as f:
150
+ json.dump(edited_config, f, indent=4)
151
+
152
+ if self.upload_to_object_store:
153
+ assert self.remote_ud is not None
154
+ # TODO change to log after other pr
155
+ log.info(
156
+ f'Uploading HuggingFace formatted checkpoint to {self.backend}://{self.bucket_name}/{save_dir}'
157
+ )
158
+ for filename in os.listdir(temp_save_dir):
159
+ self.remote_ud.upload_file(
160
+ state=state,
161
+ remote_file_name=os.path.join(save_dir, filename),
162
+ file_path=Path(os.path.join(temp_save_dir,
163
+ filename)),
164
+ overwrite=self.overwrite,
165
+ )
166
+
167
+ dist.barrier()
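The precision handling in _save_checkpoint boils down to casting every tensor in the state dict before save_pretrained is called. A self-contained version with a toy module and an assumed bfloat16 target precision:

import torch
import torch.nn as nn

# Toy module standing in for state.model.model; `precision` would be the
# callback argument ('bfloat16' in this sketch).
model = nn.Linear(4, 4)
state_dict = model.state_dict()
save_dtype = torch.bfloat16

# Cast every tensor to the save precision, as done before save_pretrained().
for k, v in state_dict.items():
    if isinstance(v, torch.Tensor):
        state_dict[k] = v.to(dtype=save_dtype)

print({k: v.dtype for k, v in state_dict.items()})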
Perceptrix/finetune/build/lib/llmfoundry/callbacks/model_gauntlet_callback.py ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from composer.core import Callback
5
+
6
+ __all__ = ['ModelGauntlet']
7
+
8
+
9
+ class ModelGauntlet(Callback):
10
+ """The ModelGauntlet callback has been renamed to EvalGauntlet.
11
+
12
+ We've created this dummy class, in order to alert anyone who may have been
13
+ importing ModelGauntlet.
14
+ """
15
+
16
+ def __init__(
17
+ self,
18
+ *args, # pyright: ignore [reportMissingParameterType]
19
+ **kwargs): # pyright: ignore [reportMissingParameterType]
20
+ raise ImportError(
21
+ 'ModelGauntlet class is deprecated, please use EvalGauntlet')
Perceptrix/finetune/build/lib/llmfoundry/callbacks/monolithic_ckpt_callback.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import contextlib
5
+ import os
6
+ import tempfile
7
+ from pathlib import Path
8
+
9
+ import torch
10
+ from composer.core import Callback, State
11
+ from composer.core.state import (fsdp_get_optim_state_dict,
12
+ fsdp_state_dict_type_context)
13
+ from composer.loggers import Logger
14
+ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
15
+ from composer.utils import (dist, format_name_with_dist_and_time, parse_uri,
16
+ reproducibility)
17
+
18
+
19
+ class MonolithicCheckpointSaver(Callback):
20
+ """Save a monolithic checkpoint every N batches.
21
+
22
+ Args:
23
+ save_folder (str): Folder to save checkpoints to (can be a URI)
24
+ filename (str): Filename to save checkpoints to.
25
+ batch_interval (int): Number of batches between checkpoints.
26
+ overwrite (bool): Whether to overwrite previous checkpoints.
27
+ keep_optimizers (bool): Whether to save the optimizer state in the monolithic checkpoint.
28
+ """
29
+
30
+ def __init__(self,
31
+ save_folder: str,
32
+ batch_interval: int,
33
+ filename: str = 'ep{epoch}-ba{batch}.pt',
34
+ overwrite: bool = False,
35
+ keep_optimizers: bool = False):
36
+ self.backend, self.bucket_name, self.save_dir_format_str = parse_uri(
37
+ save_folder)
38
+ self.filename_format_str = filename
39
+ self.batch_interval = batch_interval
40
+ self.upload_to_object_store = (self.backend != '')
41
+ self.overwrite = overwrite
42
+ self.keep_optimizers = keep_optimizers
43
+ if self.upload_to_object_store:
44
+ self.remote_ud = RemoteUploaderDownloader(
45
+ bucket_uri=f'{self.backend}://{self.bucket_name}')
46
+ else:
47
+ self.remote_ud = None
48
+
49
+ def init(self, state: State, logger: Logger) -> None:
50
+ if self.upload_to_object_store and self.remote_ud is not None:
51
+ self.remote_ud.init(state, logger)
52
+ # updated_logger_destinations = [*logger.destinations, new_remote_ud]
53
+ # logger.destinations = tuple(updated_logger_destinations)
54
+ state.callbacks.append(self.remote_ud)
55
+
56
+ def batch_checkpoint(self, state: State, logger: Logger) -> None:
57
+ if state.timestamp.batch.value % self.batch_interval == 0:
58
+ self._save_checkpoint(state, logger)
59
+
60
+ def fit_end(self, state: State, logger: Logger) -> None:
61
+ if state.timestamp.batch.value % self.batch_interval != 0:
62
+ self._save_checkpoint(state, logger)
63
+
64
+ def _save_checkpoint(self, state: State, logger: Logger) -> None:
65
+ del logger # unused
66
+
67
+ filename = format_name_with_dist_and_time(self.filename_format_str,
68
+ state.run_name,
69
+ state.timestamp)
70
+ save_dir = format_name_with_dist_and_time(self.save_dir_format_str,
71
+ state.run_name,
72
+ state.timestamp)
73
+ dir_context_mgr = tempfile.TemporaryDirectory(
74
+ ) if self.upload_to_object_store else contextlib.nullcontext(
75
+ enter_result=save_dir)
76
+ with dir_context_mgr as temp_save_dir:
77
+ # pyright doesn't know about enter_result
78
+ assert isinstance(temp_save_dir, str)
79
+
80
+ save_path = str(Path(temp_save_dir) / Path(filename))
81
+ dirname = os.path.dirname(save_path)
82
+ if dirname:
83
+ os.makedirs(dirname, exist_ok=True)
84
+ state_dict = {
85
+ 'state': state.state_dict(),
86
+ 'rng': reproducibility.get_rng_state()
87
+ }
88
+ # Remove sharded model and optimizer state dicts
89
+ state_dict['state'].pop('optimizers')
90
+ state_dict['state'].pop('model')
91
+
92
+ # Add in unsharded model params.
93
+ with fsdp_state_dict_type_context(state.model,
94
+ state_dict_type='full'):
95
+ state_dict['state']['model'] = state.model.state_dict()
96
+
97
+ # Add in unsharded optimizer state dict.
98
+ if self.keep_optimizers:
99
+ optimizer = state.optimizers[0]
100
+ state_dict['state']['optimizers'] = {
101
+ type(optimizer).__qualname__:
102
+ fsdp_get_optim_state_dict(state.model,
103
+ optimizer,
104
+ state_dict_type='full')
105
+ }
106
+ if dist.get_global_rank() == 0:
107
+ torch.save(state_dict, save_path)
108
+
109
+ if self.upload_to_object_store and self.remote_ud is not None and dist.get_global_rank(
110
+ ) == 0:
111
+ remote_file_name = str(Path(save_dir) / Path(filename))
112
+ self.remote_ud.upload_file(state=state,
113
+ remote_file_name=remote_file_name,
114
+ file_path=Path(save_path),
115
+ overwrite=self.overwrite)
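The save schedule above is a simple modulus check on the batch counter, plus a catch-up save at fit_end when training stops between intervals; the interval and batch indices below are arbitrary example values:

# Saves fire whenever the batch counter hits the interval; fit_end adds one
# last save only if training stopped off-interval.
batch_interval = 1000
for batch in (999, 1000, 2500, 3000):
    print(batch, batch % batch_interval == 0)

final_batch = 2500  # hypothetical stopping point
print('extra save at fit_end:', final_batch % batch_interval != 0)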
Perceptrix/finetune/build/lib/llmfoundry/callbacks/resumption_callbacks.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import logging
5
+ from typing import List
6
+
7
+ from composer.core import Callback, State
8
+ from composer.loggers import Logger
9
+
10
+ __all__ = [
11
+ 'GlobalLRScaling',
12
+ 'LayerFreezing',
13
+ ]
14
+
15
+ log = logging.getLogger(__name__)
16
+
17
+
18
+ class GlobalLRScaling(Callback):
19
+ """GlobalLRScaling.
20
+
21
+ This callback can be applied upon resuming a model checkpoint. Upon
22
+ fit_start it will multiply the base LR by `lr_scale` and set the WD to be
23
+
24
+ `wd_pct` * `lr`.
25
+
26
+ Args:
27
+ lr_scale (float): Multiplicative factor to scale LR by
28
+ wd_pct (float): Percentage of LR to set weight decay to.
29
+ """
30
+
31
+ def __init__(self, lr_scale: float, wd_pct: float = 0.0):
32
+ self.lr_scale = lr_scale
33
+ self.wd_pct = wd_pct
34
+
35
+ def fit_start(self, state: State, logger: Logger) -> None:
36
+ del logger # unused
37
+
38
+ if hasattr(state, 'optimizers') and state.optimizers is None:
39
+ raise Exception('No optimizers defined')
40
+ for optimizer in state.optimizers:
41
+ for group in optimizer.param_groups:
42
+ group['lr'] *= self.lr_scale
43
+ group['weight_decay'] = group['lr'] * self.wd_pct
44
+ if 'initial_lr' in group:
45
+ group['initial_lr'] *= self.lr_scale
46
+ log.info(
47
+ f"Set LR and WD to {group['lr']}, {group['weight_decay']}")
48
+
49
+ for scheduler in state.schedulers:
50
+ scheduler.base_lrs = [
51
+ self.lr_scale * lr for lr in scheduler.base_lrs
52
+ ]
53
+
54
+
55
+ class LayerFreezing(Callback):
56
+ """LayerFreezing.
57
+
58
+ This callback can be applied upon resuming a model checkpoint. Upon
59
+ fit_start it freezes the layers specified in `layer_names`. If using
60
+ activation checkpointing, please set the
61
+ `activation_checkpointing_reentrant` flag in `fsdp_config` to false.
62
+
63
+ Args:
64
+ layer_names (List[str]): Names of layers to freeze.
65
+ """
66
+
67
+ def __init__(self, layer_names: List[str]):
68
+ self.layer_names = set(layer_names)
69
+
70
+ def fit_start(self, state: State, logger: Logger) -> None:
71
+ del logger # unused
72
+
73
+ model_layers = set(name for name, _ in state.model.named_parameters())
74
+ for layer in self.layer_names:
75
+ if layer not in model_layers:
76
+ raise Exception(
77
+ f'Attempted to freeze layer not found in model: {layer}\nAvailable layers: {model_layers}'
78
+ )
79
+
80
+ successful_freeze = False
81
+ for name, p in state.model.named_parameters():
82
+ if p.requires_grad and name in self.layer_names:
83
+ p.requires_grad = False
84
+ log.debug(f'Froze layer: {name}\nParam: {p}')
85
+ successful_freeze = True
86
+
87
+ if not successful_freeze:
88
+ raise Exception(
89
+ f"Tried to run LayerFreezing but didn't freeze any layers")
Perceptrix/finetune/build/lib/llmfoundry/callbacks/scheduled_gc_callback.py ADDED
@@ -0,0 +1,75 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import gc
5
+
6
+ import torch
7
+ from composer.core import Callback, State
8
+ from composer.loggers import Logger
9
+
10
+
11
+ def gc_cuda():
12
+ """Garbage collect Torch (CUDA) memory."""
13
+ gc.collect()
14
+ if torch.cuda.is_available():
15
+ torch.cuda.empty_cache()
16
+
17
+
18
+ class ScheduledGarbageCollector(Callback):
19
+ """Disable automatic garbage collection and collect garbage at interval.
20
+
21
+ Args:
22
+ batch_interval (int): Number of batches between calls to gc.collect()
23
+ eval_keep_disabled (bool): keep gc disabled during eval (default: False)
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ batch_interval: int,
29
+ eval_keep_disabled: bool = False,
30
+ ):
31
+ self.batch_interval = batch_interval
32
+ self.eval_keep_disabled = eval_keep_disabled
33
+ self.gc_init_state = None
34
+
35
+ def fit_start(self, state: State, logger: Logger) -> None:
36
+ del state, logger # unused
37
+
38
+ # cache if automatic garbage collection is enabled; reset at fit_end
39
+ self.gc_init_state = gc.isenabled()
40
+
41
+ # disable automatic garbage collection
42
+ gc.disable()
43
+ gc_cuda()
44
+
45
+ def fit_end(self, state: State, logger: Logger) -> None:
46
+ del state, logger # unused
47
+
48
+ gc_cuda()
49
+
50
+ # reset automatic garbage collection at fit_end
51
+ if self.gc_init_state:
52
+ gc.enable()
53
+ else:
54
+ gc.disable()
55
+
56
+ def before_dataloader(self, state: State, logger: Logger) -> None:
57
+ del logger # unused
58
+
59
+ if state.timestamp.batch.value % self.batch_interval == 0:
60
+ gc_cuda()
61
+
62
+ def eval_start(self, state: State, logger: Logger) -> None:
63
+ del state, logger # unused
64
+
65
+ gc_cuda()
66
+ if not self.eval_keep_disabled:
67
+ gc.enable()
68
+
69
+ def eval_end(self, state: State, logger: Logger) -> None:
70
+ del state, logger # unused
71
+
72
+ if not self.eval_keep_disabled:
73
+ gc.disable()
74
+
75
+ gc_cuda()
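The same disable-then-collect-on-a-schedule idea, sketched outside of Composer with a dummy training loop standing in for the trainer; `batch_interval` and the loop length are arbitrary.

```python
# Plain-Python sketch of ScheduledGarbageCollector: disable automatic garbage
# collection and collect manually every `batch_interval` steps, restoring the
# interpreter's original GC setting at the end, as fit_end does.
import gc

import torch


def gc_cuda():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


batch_interval = 100
gc_was_enabled = gc.isenabled()
gc.disable()
try:
    for step in range(1000):
        # ... forward/backward/optimizer step would go here ...
        if step % batch_interval == 0:
            gc_cuda()
finally:
    if gc_was_enabled:
        gc.enable()
    else:
        gc.disable()
    gc_cuda()
```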
Perceptrix/finetune/build/lib/llmfoundry/data/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
5
+ from llmfoundry.data.denoising import (MixtureOfDenoisersCollator,
6
+ build_text_denoising_dataloader)
7
+ from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator,
8
+ build_finetuning_dataloader)
9
+ from llmfoundry.data.text_data import (StreamingTextDataset,
10
+ build_text_dataloader)
11
+
12
+ __all__ = [
13
+ 'MixtureOfDenoisersCollator',
14
+ 'build_text_denoising_dataloader',
15
+ 'Seq2SeqFinetuningCollator',
16
+ 'build_finetuning_dataloader',
17
+ 'StreamingTextDataset',
18
+ 'build_text_dataloader',
19
+ 'NoConcatDataset',
20
+ 'ConcatTokensDataset',
21
+ ]
Perceptrix/finetune/build/lib/llmfoundry/data/data.py ADDED
@@ -0,0 +1,117 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Datasets for converting to MDS Shards."""
5
+ import os
6
+ import warnings
7
+ from typing import Dict, Iterable, Union
8
+
9
+ import datasets as hf_datasets
10
+ import numpy as np
11
+ from torch.utils.data import IterableDataset
12
+ from transformers import PreTrainedTokenizerBase
13
+
14
+
15
+ class NoConcatDataset(IterableDataset):
16
+ """An IterableDataset that returns text samples for MDSWriter.
17
+
18
+ Returns dicts of {'text': bytes}
19
+ """
20
+
21
+ def __init__(self, hf_dataset: Union[hf_datasets.IterableDataset,
22
+ hf_datasets.Dataset]):
23
+ self.hf_dataset = hf_dataset
24
+
25
+ def __iter__(self) -> Iterable[Dict[str, bytes]]:
26
+ for sample in self.hf_dataset:
27
+ # convert to bytes to store in MDS binary format
28
+ yield {'text': sample['text'].encode('utf-8')}
29
+
30
+
31
+ class ConcatTokensDataset(IterableDataset):
32
+ """An IterableDataset that returns token samples for MDSWriter.
33
+
34
+ Returns dicts of {'tokens': bytes}
35
+
36
+ To use data created by this class and written to MDS format:
37
+
38
+ ```python
39
+ import numpy as np
+ import torch
40
+ from streaming.base import StreamingDataset
41
+ from transformers import AutoTokenizer
42
+
43
+ tokenizer = AutoTokenizer.from_pretrained('your/tokenizer')
44
+ ds = StreamingDataset(local='mds-data-folder', split='val')
45
+
46
+ # note, you need to copy the numpy array because the original is non-writeable
47
+ # and torch does not support non-writeable tensors, so you get a scary warning and
48
+ # if you do try to write to the tensor you get undefined behavior
49
+ tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int64).copy())
50
+ print(tokenizer.decode(tokens))
51
+ ```
52
+ """
53
+
54
+ def __init__(
55
+ self,
56
+ hf_dataset: Union[hf_datasets.IterableDataset, hf_datasets.Dataset],
57
+ tokenizer: PreTrainedTokenizerBase,
58
+ max_length: int,
59
+ bos_text: str,
60
+ eos_text: str,
61
+ no_wrap: bool,
62
+ ):
63
+ self.hf_dataset = hf_dataset
64
+ self.tokenizer = tokenizer
65
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
66
+ self.max_length = max_length
67
+ self.bos_text = bos_text
68
+ self.eos_text = eos_text
69
+ self.should_wrap = not no_wrap
70
+
71
+ self.bos_tokens = self.tokenizer(self.bos_text,
72
+ truncation=False,
73
+ padding=False,
74
+ add_special_tokens=False)['input_ids']
75
+ if len(self.bos_tokens) > 1:
76
+ warnings.warn(
77
+ f'You specified --concat_tokens with --bos_text, but your BOS text is not tokenizing to one token\
78
+ , instead we got {self.bos_tokens}. Quit if this was in error.')
79
+
80
+ self.eos_tokens = self.tokenizer(self.eos_text,
81
+ truncation=False,
82
+ padding=False,
83
+ add_special_tokens=False)['input_ids']
84
+ if len(self.eos_tokens) > 1:
85
+ warnings.warn(
86
+ f'You specified --concat_tokens with --eos_text, but your EOS text is not tokenizing to one token\
87
+ , instead we got {self.eos_tokens}. Quit if this was in error.')
88
+
89
+ eos_text_provided = self.eos_text != ''
90
+ bos_text_provided = self.bos_text != ''
91
+ test_text = self.tokenizer('')
92
+ if len(test_text['input_ids']) > 0 and (eos_text_provided or
93
+ bos_text_provided):
94
+ message = 'both eos and bos' if eos_text_provided and bos_text_provided else (
95
+ 'eos_text' if eos_text_provided else 'bos_text')
96
+ warnings.warn(
97
+ f'The provided tokenizer adds special tokens, but you also specified {message}. This may result '
98
+ +
99
+ 'in duplicated special tokens. Please be sure this is what you intend.'
100
+ )
101
+
102
+ def __iter__(self) -> Iterable[Dict[str, bytes]]:
103
+
104
+ buffer = []
105
+ for sample in self.hf_dataset:
106
+ encoded = self.tokenizer(sample['text'],
107
+ truncation=False,
108
+ padding=False)
109
+ iids = encoded['input_ids']
110
+ buffer = buffer + self.bos_tokens + iids + self.eos_tokens
111
+ while len(buffer) >= self.max_length:
112
+ concat_sample = buffer[:self.max_length]
113
+ buffer = buffer[self.max_length:] if self.should_wrap else []
114
+ yield {
115
+ # convert to bytes to store in MDS binary format
116
+ 'tokens': np.asarray(concat_sample).tobytes()
117
+ }
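The docstring above shows how to read `ConcatTokensDataset` output back from MDS shards; the sketch below shows the writing side with `streaming`'s `MDSWriter`. The GPT-NeoX tokenizer, the streamed C4 corpus, the sample cap, and the output path are illustrative assumptions, not choices made by this repo's code.

```python
# Hedged sketch: tokenize-and-concatenate a HuggingFace dataset and write the
# resulting {'tokens': bytes} samples to MDS shards.
import datasets as hf_datasets
from streaming import MDSWriter
from transformers import AutoTokenizer

from llmfoundry.data import ConcatTokensDataset

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')  # assumed choice
hf_dataset = hf_datasets.load_dataset('c4', 'en', split='train', streaming=True)  # assumed corpus

ds = ConcatTokensDataset(
    hf_dataset=hf_dataset,
    tokenizer=tokenizer,
    max_length=2048,
    bos_text='',
    eos_text='<|endoftext|>',
    no_wrap=False,
)

# Each yielded sample is {'tokens': bytes}, matching the 'bytes' column type.
with MDSWriter(out='mds-data-folder/train', columns={'tokens': 'bytes'}) as out:
    for i, sample in enumerate(ds):
        out.write(sample)
        if i >= 999:  # keep the sketch small; remove this cap for a full conversion
            break
```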
Perceptrix/finetune/build/lib/llmfoundry/data/denoising.py ADDED
@@ -0,0 +1,937 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Streaming dataloader for (mixture of) denoising task(s)."""
5
+
6
+ import logging
7
+ import random
8
+ import sys
9
+ from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Union
10
+
11
+ import numpy as np
12
+ import torch
13
+ from omegaconf import DictConfig
14
+ from omegaconf import OmegaConf as om
15
+ from torch.utils.data import DataLoader
16
+ from transformers import PreTrainedTokenizerBase
17
+
18
+ from llmfoundry.data.packing import BinPackWrapper
19
+ from llmfoundry.data.text_data import StreamingTextDataset
20
+ from llmfoundry.models import utils
21
+
22
+ __all__ = ['MixtureOfDenoisersCollator', 'build_text_denoising_dataloader']
23
+
24
+ log = logging.getLogger(__name__)
25
+
26
+ # HuggingFace hardcodes the ignore index to -100
27
+ _HF_IGNORE_INDEX = -100
28
+
29
+ # Required signature of any `prefix_function` (see below)
30
+ PREFIX_FUNCTION = Callable[[float, Optional[float], PreTrainedTokenizerBase],
31
+ Sequence[int]]
32
+
33
+
34
+ def ul2_prefix_function(
35
+ mask_ratio: float,
36
+ mean_length: Optional[float],
37
+ tokenizer: PreTrainedTokenizerBase,
38
+ ) -> Sequence[int]:
39
+ """Generates prefixes based on UL2 paper.
40
+
41
+ See: http://arxiv.org/abs/2205.05131
42
+ """
43
+ if mean_length is None:
44
+ # This is the case for "sequence to sequence"
45
+ prefix = '[S2S]' if mask_ratio < 1.0 else '[CLM]'
46
+ elif mean_length >= 12 or mask_ratio >= 0.3:
47
+ # UL2 tags this corruption rate "extreme"
48
+ prefix = '[NLG]'
49
+ else:
50
+ # UL2 tags this corruption rate as "regular"
51
+ prefix = '[NLU]'
52
+ return tokenizer(prefix, add_special_tokens=False).input_ids
53
+
54
+
55
+ class MixtureOfDenoisersCollator:
56
+ """Data collator for mixture of span-corruption denoisers, as in UL2.
57
+
58
+ This collator supports a variety of tasks used to pre-train an
59
+ encoder-decoder model or a (prefix LM) decoder-only model. This is meant
60
+ to be used with a dataset that yields tokenized text sequences. It is not
61
+ required that the token sequences are already padded or truncated, as this
62
+ collator will internally truncate and pad as needed.
63
+
64
+ For the denoising mixture recommended in the original UL2 paper,
65
+ http://arxiv.org/abs/2205.05131, use:
66
+ .. python:
67
+ MixtureOfDenoisersCollator(
68
+ ...,
69
+ span_mean_lengths_and_ratios=[
70
+ [3, .15],
71
+ [8, .15],
72
+ [3, .50],
73
+ [8, .50],
74
+ [64, .15],
75
+ [64, .50],
76
+ ],
77
+ sequence_mask_ratios=0.25
78
+ )
79
+
80
+ Args:
81
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
82
+ prepare the data from raw text. Any missing sentinel tokens will
83
+ be added by the collator.
84
+ max_seq_length (int): The maximum length of sequences produced by this
85
+ collator. Incoming sequences may be truncated to accommodate this
86
+ limit.
87
+ Note that when formatting for decoder-only models, the context
88
+ tokens and target tokens are concatenated, and max_seq_length
89
+ applies to their combined length. For encoder-decoder models, both
90
+ the encoder and decoder will see up to max_seq_length tokens.
91
+ decoder_only_format (bool, optional): Whether to format the batches
92
+ for a decoder-only model (i.e. a prefix LM) or, if ``False``, an
93
+ encoder-decoder model. Default: ``False``.
94
+ span_mean_lengths_and_ratios (optional): A length-2 list of a
95
+ ``[mean_length, mask_ratio]`` pair, or a list of such pairs. Each
96
+ pair adds a span corruption denoising task to the task mixture. For
97
+ example, ``[3, 0.15]`` adds the original span corruption task used
98
+ for pre-training a T5 model as in http://arxiv.org/abs/1910.10683,
99
+ which trained with a single span corruption task that used a mean
100
+ span length of 3 and a mask ratio of 15%.
101
+ Default: ``None`` does not add any span corruption tasks.
102
+ sequence_mask_ratios (optional): A float or list of floats, one for each
103
+ sequence corruption denoising task to add to the task mixture. Each
104
+ sequence mask ratio must be greater than 0.0 and at most 1.0.
105
+ This type of task is a special instance of span corruption, with
106
+ exactly one masked span taken from the end of the sequence. The
107
+ length of the span is sampled uniformly such that the average
108
+ portion of masked tokens equals sequence_mask_ratio.
109
+ Note: A value of 1.0 essentially yields causal LM examples.
110
+ Default: ``None`` does not add any sequence corruption tasks.
111
+ allow_pad_trimming (bool, optional): Whether to allow the collator to
112
+ trim away sequence regions that are entirely padding (i.e. padding
113
+ for each example in the batch). If ``True``, shorter sequences may
114
+ improve throughput but at a potentially higher memory cost
115
+ owing to variable sequence lengths from batch to batch.
116
+ Default: ``False`` yields batches that are always padded to
117
+ max_seq_length.
118
+ prefix_function (callable, optional): A function that maps denoising
119
+ task parameters (e.g. mean_length=3, mask_ratio=0.15) to a prefix
120
+ that will be added to sequences when the associated "noiser" is
121
+ applied.
122
+ To disable these prefixes, use a value of ``None``.
123
+ Default: :func:`ul2_prefix_function` applies the prefix scheme
124
+ suggested in the UL2 paper: http://arxiv.org/abs/2205.05131.
125
+ context_eos (bool, optional): Whether to attach an EOS token to the end
126
+ of the context sequence, marking the transition from context to
127
+ target sequence. Only applicable if decoder_only_format is True.
128
+ Context EOS tokens are always added for encoder-decoder format.
129
+ Default: ``False`` does not attach context EOS.
130
+ """
131
+
132
+ def __init__(
133
+ self,
134
+ tokenizer: PreTrainedTokenizerBase,
135
+ max_seq_length: int,
136
+ decoder_only_format: bool = False,
137
+ span_mean_lengths_and_ratios: Optional[List] = None,
138
+ sequence_mask_ratios: Optional[Union[List[float], float]] = None,
139
+ allow_pad_trimming: bool = False,
140
+ prefix_function: Optional[PREFIX_FUNCTION] = ul2_prefix_function,
141
+ context_eos: Optional[bool] = None,
142
+ ):
143
+ # Prepare the tokenizer for denoising tasks
144
+ utils.adapt_tokenizer_for_denoising(tokenizer)
145
+
146
+ self.tokenizer = tokenizer
147
+ self.max_seq_length = max_seq_length
148
+ self.decoder_only_format = decoder_only_format
149
+ self._sentinel_token_ids = np.array(self.tokenizer.sentinel_token_ids)
150
+
151
+ # Trimming will always be skipped on at least the first __call__
152
+ self._allow_pad_trimming = allow_pad_trimming
153
+ self._seen_first_batch = False
154
+
155
+ self.context_eos = bool(context_eos) if decoder_only_format else True
156
+
157
+ # Process the span_mean_lengths_and_ratios argument
158
+ if span_mean_lengths_and_ratios is None:
159
+ # In this case, there are no span corruption tasks
160
+ self.span_mean_lengths_and_ratios = []
161
+ elif isinstance(span_mean_lengths_and_ratios[0], (int, float)):
162
+ # In this case, there is one span corruption task
163
+ if not len(span_mean_lengths_and_ratios) == 2:
164
+ raise ValueError('`span_mean_lengths_and_ratios` must be a ' + \
165
+ 'pair of [mean_length, mask_ratio], a list ' + \
166
+ f'of such pairs, or None. Got {span_mean_lengths_and_ratios}.')
167
+ self.span_mean_lengths_and_ratios = [span_mean_lengths_and_ratios]
168
+ else:
169
+ # In this case, there are one or more span corruption tasks
170
+ span_mean_lengths_and_ratios = list(span_mean_lengths_and_ratios)
171
+ for spec_pair in span_mean_lengths_and_ratios:
172
+ if len(spec_pair) != 2:
173
+ raise ValueError('`span_mean_lengths_and_ratios` must be a ' + \
174
+ 'pair of [mean_length, mask_ratio], a list ' + \
175
+ f'of such pairs, or None. Got {span_mean_lengths_and_ratios}.')
176
+ self.span_mean_lengths_and_ratios = span_mean_lengths_and_ratios
177
+
178
+ # Process the sequence_mask_ratios argument
179
+ if sequence_mask_ratios is None:
180
+ # In this case, there are no sequence corruption tasks
181
+ self.sequence_mask_ratios = []
182
+ elif isinstance(sequence_mask_ratios, float):
183
+ # In this case, there is one sequence corruption task
184
+ self.sequence_mask_ratios = [sequence_mask_ratios]
185
+ else:
186
+ # In this case, there is one or more sequence corruption tasks
187
+ for ratio in sequence_mask_ratios:
188
+ if not (0 < ratio <= 1.0):
189
+ raise ValueError('`sequence_mask_ratios` must be a float (or list '+\
190
+ 'of floats) that are each >0.0 and <=1.0, or None. '+\
191
+ f'Got {sequence_mask_ratios}.')
192
+ self.sequence_mask_ratios = sequence_mask_ratios
193
+
194
+ # Populate the noisers so we can learn to denoise them!
195
+ self._noisers = []
196
+ self._smallest_max_raw_length = self.max_seq_length * 100
197
+ self._largest_max_raw_length = 0
198
+ self._uses_span_corruption = False
199
+
200
+ # Add "noisers" for any span corruption denoising tasks
201
+ # Each mean_length / mask_ratio combo becomes one of the span
202
+ # corruption denoising tasks
203
+ for span_mean_length, span_mask_ratio in self.span_mean_lengths_and_ratios:
204
+ self._uses_span_corruption = True
205
+ if span_mean_length < 0:
206
+ raise ValueError('All span mean lengths must be positive.')
207
+ if not 0 < span_mask_ratio < 1.0:
208
+ raise ValueError(
209
+ 'All span masking ratios must be between 0.0 and 1.0.')
210
+
211
+ if prefix_function is not None:
212
+ prefix_tokens = prefix_function(span_mask_ratio,
213
+ span_mean_length,
214
+ self.tokenizer)
215
+ else:
216
+ prefix_tokens = None
217
+
218
+ max_raw_length = _get_max_starting_length(
219
+ max_length=self.max_seq_length,
220
+ mask_ratio=span_mask_ratio,
221
+ mean_span_length=span_mean_length,
222
+ n_prefix_tokens=len(prefix_tokens or []),
223
+ decoder_only_format=self.decoder_only_format,
224
+ context_eos=self.context_eos)
225
+ if max_raw_length < self._smallest_max_raw_length:
226
+ self._smallest_max_raw_length = max_raw_length
227
+ if max_raw_length > self._largest_max_raw_length:
228
+ self._largest_max_raw_length = max_raw_length
229
+
230
+ kwargs = {
231
+ 'mean_span_length': span_mean_length,
232
+ 'mask_ratio': span_mask_ratio,
233
+ 'prefix_tokens': prefix_tokens,
234
+ 'max_raw_length': max_raw_length,
235
+ }
236
+ self._noisers.append(kwargs)
237
+
238
+ # Add "noisers" for any sequential denoising tasks
239
+ for sequence_mask_ratio in self.sequence_mask_ratios:
240
+ if prefix_function is not None:
241
+ prefix_tokens = prefix_function(sequence_mask_ratio, None,
242
+ self.tokenizer)
243
+ else:
244
+ prefix_tokens = None
245
+
246
+ max_raw_length = self.max_seq_length - len(prefix_tokens or []) - 1
247
+ if decoder_only_format and self.context_eos:
248
+ max_raw_length = max_raw_length - 1
249
+
250
+ if not self._uses_span_corruption and (
251
+ max_raw_length < self._smallest_max_raw_length):
252
+ # We choose not to count sequence denoising in the smallest
253
+ # unless there is only sequence denoising.
254
+ self._smallest_max_raw_length = max_raw_length
255
+ if max_raw_length > self._largest_max_raw_length:
256
+ self._largest_max_raw_length = max_raw_length
257
+
258
+ kwargs = {
259
+ 'mean_span_length': None,
260
+ 'mask_ratio': sequence_mask_ratio,
261
+ 'prefix_tokens': prefix_tokens,
262
+ 'max_raw_length': max_raw_length,
263
+ }
264
+ self._noisers.append(kwargs)
265
+
266
+ if not self._noisers:
267
+ raise ValueError(
268
+ 'No denoising tasks were included. Make sure to set ' + \
269
+ '`span_mean_lengths_and_ratios` and/or `sequence_mask_ratios`.')
270
+
271
+ @property
272
+ def smallest_max_raw_length(self) -> int:
273
+ return int(self._smallest_max_raw_length)
274
+
275
+ @property
276
+ def largest_max_raw_length(self) -> int:
277
+ return int(self._largest_max_raw_length)
278
+
279
+ def __call__(self, examples: List[Dict[str,
280
+ Any]]) -> Dict[str, torch.Tensor]:
281
+ """Batch examples processed by the span corrupter."""
282
+ processed_examples = []
283
+ for example in examples:
284
+ # Randomly pick a "noiser" to apply to this example
285
+ noiser = random.choice(self._noisers)
286
+ # Apply it
287
+ processed_examples.append(
288
+ noise_token_sequence(
289
+ example,
290
+ mask_ratio=noiser['mask_ratio'],
291
+ mean_span_length=noiser['mean_span_length'],
292
+ prefix_tokens=noiser['prefix_tokens'],
293
+ max_raw_length=noiser['max_raw_length'],
294
+ max_seq_length=self.max_seq_length,
295
+ tokenizer=self.tokenizer,
296
+ sentinel_token_ids=self._sentinel_token_ids,
297
+ decoder_only_format=self.decoder_only_format,
298
+ context_eos=self.context_eos))
299
+ batch = self.tokenizer.pad(processed_examples)
300
+
301
+ # This logic prevents trimming on at least the first batch
302
+ if not (self._allow_pad_trimming and self._seen_first_batch):
303
+ self._seen_first_batch = True
304
+ return batch
305
+ self._seen_first_batch = True
306
+
307
+ # Truncate portions of the inputs that are purely padding
308
+ # (up to a multiple of 8)
309
+ multiple_of = 8
310
+ n_examples_per_length = batch['attention_mask'].sum(0)
311
+ keep_tokens = torch.sum(n_examples_per_length > 0)
312
+ keep_tokens = int(multiple_of * torch.ceil(keep_tokens / multiple_of))
313
+
314
+ # Note: EncDec formatting will always produce a right-padded batch
315
+ if self.tokenizer.padding_side == 'left' and self.decoder_only_format:
316
+ batch['input_ids'] = batch['input_ids'][:, -keep_tokens:]
317
+ batch['attention_mask'] = batch['attention_mask'][:, -keep_tokens:]
318
+ else:
319
+ batch['input_ids'] = batch['input_ids'][:, :keep_tokens]
320
+ batch['attention_mask'] = batch['attention_mask'][:, :keep_tokens]
321
+
322
+ if self.decoder_only_format:
323
+ if self.tokenizer.padding_side == 'left':
324
+ batch['labels'] = batch['labels'][:, -keep_tokens:]
325
+ batch['bidirectional_mask'] = batch[
326
+ 'bidirectional_mask'][:, -keep_tokens:]
327
+ else:
328
+ batch['labels'] = batch['labels'][:, :keep_tokens]
329
+ batch['bidirectional_mask'] = batch[
330
+ 'bidirectional_mask'][:, :keep_tokens]
331
+
332
+ else:
333
+ # Truncate portions of the decoder inputs that are purely padding
334
+ n_examples_per_length = batch['decoder_attention_mask'].sum(0)
335
+ keep_tokens = torch.sum(n_examples_per_length > 0)
336
+ keep_tokens = int(multiple_of *
337
+ torch.ceil(keep_tokens / multiple_of))
338
+
339
+ batch['labels'] = batch['labels'][:, :keep_tokens]
340
+ batch['decoder_attention_mask'] = batch[
341
+ 'decoder_attention_mask'][:, :keep_tokens]
342
+ batch['decoder_input_ids'] = batch[
343
+ 'decoder_input_ids'][:, :keep_tokens]
344
+
345
+ # This slicing can produce non-contiguous tensors, so use .contiguous
346
+ # to prevent related problems
347
+ batch = {k: v.contiguous() for k, v in batch.items()}
348
+
349
+ return batch
350
+
351
+
352
+ def build_text_denoising_dataloader(
353
+ cfg: DictConfig,
354
+ tokenizer: PreTrainedTokenizerBase,
355
+ device_batch_size: int,
356
+ ) -> DataLoader[Dict]:
357
+ """Constructor function for a Mixture of Denoisers dataloader.
358
+
359
+ This function constructs a dataloader that can be used to train an
360
+ encoder-decoder model or a (prefix LM) decoder-only model on a text
361
+ denoising task mixture (e.g. span corruption, or UL2).
362
+
363
+ The underlying dataset is a :class:`StreamingTextDataset`, allowing you to
364
+ stream raw text data or pre-tokenized text data.
365
+
366
+ The dataloader uses a :class:`MixtureOfDenoisersCollator` to prepare the
367
+ tokenized examples into training batches.
368
+
369
+ Args:
370
+ cfg (DictConfig): An omegaconf dictionary used to configure the loader:
371
+ cfg.name (str): The type of dataloader to build. Must = "text_denoising".
372
+ ---
373
+ cfg.dataset.max_seq_len (int): The maximum length of sequences
374
+ in the batch. See :class:`MixtureOfDenoisersCollator` docstring
375
+ for details.
376
+ cfg.dataset.packing_ratio (float, optional): If provided, this invokes
377
+ a collator wrapper that packs device_batch_size*packing_ratio
378
+ raw examples into device_batch_size packed examples. This helps
379
+ minimize padding while preserving sequence integrity.
380
+ This adds `sequence_id` to the batch, which indicates which unique
381
+ sequence each token belongs to.
382
+ Note: Using this feature will not change device_batch_size but it
383
+ will determine the number of raw examples consumed by the dataloader
384
+ per batch. Some examples may be discarded if they do not fit when
385
+ packing.
386
+ Select packing_ratio **carefully** based on the dataset
387
+ statistics, max_seq_len, and tolerance for discarding samples!
388
+ The packing code in `./packing.py` provides a script that can help
389
+ you choose the best packing_ratio.
390
+ See :class:`StreamingTextDataset` for info on other standard config
391
+ options within `cfg.dataset`.
392
+ ---
393
+ cfg.mixture_of_denoisers.decoder_only_format (bool): Whether the
394
+ batches should use the format required for training a decoder-only
395
+ model (if ``True``) or an encoder-decoder model (if ``False``).
396
+ cfg.mixture_of_denoisers.span_mean_lengths_and_ratios (optional): The
397
+ parameters for any span corruption denoising tasks to include in
398
+ the task mixture.
399
+ See :class:`MixtureOfDenoisersCollator` docstring for details.
400
+ cfg.mixture_of_denoisers.sequence_mask_ratios (optional): The
401
+ parameters for any sequence denoising tasks to include in the
402
+ task mixture.
403
+ See :class:`MixtureOfDenoisersCollator` docstring for details.
404
+ cfg.mixture_of_denoisers.allow_pad_trimming (optional): Whether to
405
+ allow the collator to trim padding when possible (if ``True``).
406
+ Defaults to ``False``.
407
+ cfg.mixture_of_denoisers.prefix_function (optional): Set to ``None``
408
+ to disable the UL2-style prefixes that will be automatically
409
+ added by default.
410
+ ---
411
+ See :class:`DataLoader` for standard argument options to the pytorch
412
+ dataloader, such as `cfg.drop_last`, `cfg.num_workers`, etc.
413
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
414
+ prepare the data from raw text. Any missing sentinel tokens will
415
+ be added by the collator.
416
+ device_batch_size (int): The size of the batches (number of examples)
417
+ that the dataloader will produce.
418
+
419
+ Note:
420
+ You can run the script inside `./packing.py` to quickly test the
421
+ padding/waste rates for different `cfg.dataset.packing_ratio` choices,
422
+ given a starting workload YAML.
423
+ """
424
+ assert cfg.name == 'text_denoising', f'Tried to build a text_denoising dataloader with cfg.name={cfg.name}'
425
+
426
+ collate_fn = MixtureOfDenoisersCollator(
427
+ tokenizer=tokenizer,
428
+ max_seq_length=cfg.dataset.max_seq_len,
429
+ decoder_only_format=cfg.mixture_of_denoisers.decoder_only_format,
430
+ span_mean_lengths_and_ratios=cfg.mixture_of_denoisers.get(
431
+ 'span_mean_lengths_and_ratios'),
432
+ sequence_mask_ratios=cfg.mixture_of_denoisers.get(
433
+ 'sequence_mask_ratios'),
434
+ allow_pad_trimming=cfg.mixture_of_denoisers.get('allow_pad_trimming',
435
+ False),
436
+ prefix_function=cfg.mixture_of_denoisers.get('prefix_function',
437
+ ul2_prefix_function),
438
+ context_eos=cfg.mixture_of_denoisers.get('context_eos'))
439
+
440
+ truncate_to = cfg.mixture_of_denoisers.get('truncate_raw_tokens_to')
441
+ if truncate_to is None:
442
+ # By default, truncate to the largest max raw length of the denoisers
443
+ truncate_to = collate_fn.largest_max_raw_length
444
+ elif isinstance(truncate_to, str):
445
+ if truncate_to.lower() == 'min':
446
+ # Truncate to the smallest max raw length of the denoisers
447
+ truncate_to = collate_fn.smallest_max_raw_length
448
+ elif truncate_to.lower() == 'max':
449
+ # Truncate to the largest max raw length of the denoisers
450
+ truncate_to = collate_fn.largest_max_raw_length
451
+ else:
452
+ raise ValueError(
453
+ f'truncate_raw_tokens_to(="{truncate_to.lower()}") must be "min", "max", a positive int, or None.'
454
+ )
455
+ else:
456
+ if not isinstance(truncate_to, int):
457
+ raise ValueError(
458
+ f'truncate_raw_tokens_to(={truncate_to}) must be "min", "max", a positive int, or None.'
459
+ )
460
+ if truncate_to < 0:
461
+ raise ValueError(
462
+ f'truncate_raw_tokens_to(={truncate_to}) must be "min", "max", a positive int, or None.'
463
+ )
464
+
465
+ dataset = StreamingTextDataset(
466
+ local=cfg.dataset.local,
467
+ tokenizer=tokenizer,
468
+ max_seq_len=truncate_to,
469
+ remote=cfg.dataset.get('remote'),
470
+ split=cfg.dataset.get('split'),
471
+ shuffle=cfg.dataset.get('shuffle', False),
472
+ predownload=cfg.dataset.get('predownload', 100_000),
473
+ keep_zip=cfg.dataset.get('keep_zip', False),
474
+ download_retry=cfg.dataset.get('download_retry', 2),
475
+ download_timeout=cfg.dataset.get('download_timeout', 60),
476
+ validate_hash=cfg.dataset.get('validate_hash'),
477
+ shuffle_seed=cfg.dataset.get('shuffle_seed', 9176),
478
+ num_canonical_nodes=cfg.dataset.get('num_canonical_nodes', 128),
479
+ batch_size=device_batch_size,
480
+ )
481
+
482
+ if dataset.tokenizer.pad_token is None:
483
+ dataset.tokenizer.pad_token = dataset.tokenizer.eos_token
484
+
485
+ if cfg.dataset.get('packing_ratio'):
486
+ n_examples_to_pack = int(device_batch_size * cfg.dataset.packing_ratio)
487
+ if n_examples_to_pack < device_batch_size:
488
+ raise ValueError('packing_ratio must be >= 1, if supplied')
489
+ if not cfg.mixture_of_denoisers.decoder_only_format:
490
+ raise NotImplementedError(
491
+ 'On-the-fly packing is currently only supported for decoder-only formats.'
492
+ )
493
+ collate_fn = BinPackWrapper(
494
+ collator=collate_fn,
495
+ target_batch_size=device_batch_size,
496
+ max_seq_len=cfg.dataset.max_seq_len,
497
+ pad_token_id=dataset.tokenizer.pad_token_id,
498
+ padding_side=dataset.tokenizer.padding_side,
499
+ max_leftover_bins_to_keep=cfg.dataset.get(
500
+ 'max_leftover_bins_to_keep'),
501
+ )
502
+ device_batch_size = n_examples_to_pack
503
+ elif cfg.dataset.get('max_leftover_bins_to_keep') is not None:
504
+ raise ValueError(
505
+ 'cfg.dataset.max_leftover_bins_to_keep has been defined, ' +\
506
+ 'but cfg.dataset.packing_ratio has not been set. Please set ' +\
507
+ 'the latter to turn on packing or remove the former from the config.')
508
+
509
+ return DataLoader(
510
+ dataset,
511
+ collate_fn=collate_fn,
512
+ batch_size=device_batch_size,
513
+ drop_last=cfg.drop_last,
514
+ num_workers=cfg.num_workers,
515
+ pin_memory=cfg.get('pin_memory', True),
516
+ prefetch_factor=cfg.get('prefetch_factor', 2),
517
+ persistent_workers=cfg.get('persistent_workers', False),
518
+ timeout=cfg.get('timeout', 0),
519
+ )
520
+
521
+
522
+ def noise_token_sequence(
523
+ example: Union[torch.Tensor, Mapping[str, Any]],
524
+ mask_ratio: float,
525
+ mean_span_length: Optional[float],
526
+ prefix_tokens: Optional[Sequence[int]],
527
+ max_raw_length: int,
528
+ max_seq_length: int,
529
+ tokenizer: PreTrainedTokenizerBase,
530
+ sentinel_token_ids: np.ndarray,
531
+ decoder_only_format: bool,
532
+ context_eos: bool,
533
+ ) -> Dict[str, torch.Tensor]:
534
+ """Span corruption applicable to all UL2 denoising tasks."""
535
+ # Extract the raw text tokens (trim if we need to)
536
+ if isinstance(example, torch.Tensor):
537
+ # If the example is a tensor, assume it is the raw tokens with no padding
538
+ tokens = example
539
+ length = len(tokens)
540
+ else:
541
+ tokens = example['input_ids']
542
+ length = sum(example['attention_mask'])
543
+ if length > max_raw_length:
544
+ length = max_raw_length
545
+ if tokenizer.padding_side == 'left':
546
+ tokens = tokens[-length:]
547
+ else:
548
+ tokens = tokens[:length]
549
+
550
+ prefix_tokens = prefix_tokens or []
551
+
552
+ if length < 1:
553
+ raise ValueError('Example cannot be empty, but got token length < 1.')
554
+
555
+ # mean_span_length==None is a special case for "sequential" denoising
556
+ # (where a single span at the end of the sequence is masked)
557
+ if mean_span_length is None:
558
+ # This ensures that exactly 1 span will be produced and that
559
+ # trimming to max_seq_length won't cut off any <EOS> token.
560
+ # In the decoder-only case, this won't insert new tokens.
561
+ if mask_ratio <= 0.5:
562
+ u = np.random.uniform(low=0.0, high=mask_ratio * 2)
563
+ else:
564
+ u = np.random.uniform(low=(mask_ratio * 2) - 1, high=1.0)
565
+ mean_span_length = float(np.round(1 + u * (length - 1)))
566
+ mask_ratio = mean_span_length / length
567
+ use_sentinels = False
568
+ else:
569
+ use_sentinels = True
570
+
571
+ # Generate the mask
572
+ # Note: this function can be used for all the UL2 noising functions
573
+ mask = _sample_mask_array(length, mask_ratio, mean_span_length)
574
+ # The sequence should always be unmasked at the beginning
575
+ assert mask[0] == 0
576
+
577
+ # Generate the input/label sequences given the raw tokens and the mask
578
+ tokens_inputs = _apply_mask(tokens,
579
+ mask,
580
+ use_sentinels,
581
+ tokenizer.eos_token_id,
582
+ sentinel_token_ids,
583
+ ensure_eos=context_eos)
584
+ tokens_labels = _apply_mask(tokens,
585
+ 1 - mask,
586
+ use_sentinels,
587
+ tokenizer.eos_token_id,
588
+ sentinel_token_ids,
589
+ ensure_eos=True)
590
+
591
+ # Tag the inputs with any prefix
592
+ if prefix_tokens:
593
+ tokens_inputs = np.concatenate([prefix_tokens, tokens_inputs])
594
+
595
+ # Trim if necessary
596
+ if len(tokens_inputs) > max_seq_length:
597
+ raise ValueError('This should not exceed the max length')
598
+ if len(tokens_labels) > max_seq_length:
599
+ raise ValueError('This should not exceed the max length')
600
+
601
+ tokens_inputs = torch.LongTensor(tokens_inputs)
602
+ tokens_labels = torch.LongTensor(tokens_labels)
603
+
604
+ if decoder_only_format:
605
+ return _format_tokens_for_decoder_only(tokens_inputs, tokens_labels,
606
+ max_seq_length,
607
+ tokenizer.pad_token_id,
608
+ tokenizer.padding_side)
609
+ return _format_tokens_for_encoder_decoder(tokens_inputs, tokens_labels,
610
+ max_seq_length,
611
+ tokenizer.pad_token_id)
612
+
613
+
614
+ def _get_max_starting_length(max_length: int, mask_ratio: float,
615
+ mean_span_length: float, n_prefix_tokens: int,
616
+ decoder_only_format: bool,
617
+ context_eos: bool) -> int:
618
+ """Get max num raw tokens that will fit max_length."""
619
+
620
+ def sequence_stats(length: int):
621
+ length = np.maximum(length, 2)
622
+ num_noise_tokens = int(np.round(mask_ratio * float(length)))
623
+ num_noise_tokens = np.minimum(np.maximum(num_noise_tokens, 1),
624
+ length - 1)
625
+ num_spans = int(np.round(float(num_noise_tokens) / mean_span_length))
626
+ num_noise_spans = np.maximum(num_spans, 1)
627
+ num_nonnoise_tokens = length - num_noise_tokens
628
+ # Prefix, sentinel, and EOS added to input for Enc-Dec
629
+ extra_inp_tokens = n_prefix_tokens + num_noise_spans + int(context_eos)
630
+ # Sentinel and EOS added to target
631
+ extra_targ_tokens = num_noise_spans + 1
632
+ # Sequence totals after corruption
633
+ total_inp_tokens = num_nonnoise_tokens + extra_inp_tokens
634
+ total_targ_tokens = num_noise_tokens + extra_targ_tokens
635
+ return total_inp_tokens, total_targ_tokens
636
+
637
+ def length_fits(length: int) -> bool:
638
+ total_inp_tokens, total_targ_tokens = sequence_stats(length)
639
+ if decoder_only_format:
640
+ return (total_inp_tokens + total_targ_tokens) <= max_length
641
+ return (total_inp_tokens <= max_length) and (total_targ_tokens <=
642
+ max_length)
643
+
644
+ # Start with a definitely too-long sequence and reduce until it fits
645
+ num_raw_tokens = max_length * 2
646
+ while num_raw_tokens > 0:
647
+ if length_fits(num_raw_tokens):
648
+ return num_raw_tokens
649
+ num_raw_tokens -= 1
650
+ raise ValueError(
651
+ 'Unable to find a starting sequence length that can fit given the corruption and max_length parameters.'
652
+ )
653
+
654
+
655
+ def _sample_mask_array(length: int, mask_ratio: float,
656
+ mean_span_length: float) -> np.ndarray:
657
+ """Samples a span corruption mask."""
658
+ if mask_ratio == 0.0:
659
+ return np.zeros(length)
660
+ # This first block computes the number of noise/non-noise spans and the
661
+ # total tokens in each. Extra steps are taken to handle edge cases that
662
+ # cause degeneracy.
663
+ starting_length = length
664
+ length = np.maximum(length, 2)
665
+ num_noise_tokens = int(np.round(mask_ratio * float(length)))
666
+ num_noise_tokens = np.minimum(np.maximum(num_noise_tokens, 1), length - 1)
667
+ num_spans = int(np.round(float(num_noise_tokens) / mean_span_length))
668
+ num_noise_spans = np.maximum(num_spans, 1)
669
+ num_nonnoise_tokens = length - num_noise_tokens
670
+
671
+ # Sample the noise/non-noise span lengths and interleave them to
672
+ # generate the mask array.
673
+ # Note: We always start with a non-noise span.
674
+ def _sample_span_lengths(total_tokens: int, num_spans: int) -> np.ndarray:
675
+ """Samples lengths of num_spans segments.
676
+
677
+ Note: the combined length of segments equals total_tokens.
678
+ """
679
+ span_markers = np.less(np.arange(total_tokens - 1), num_spans -
680
+ 1)[np.random.permutation(total_tokens - 1)]
681
+ span_start_indicator = np.concatenate([np.array([0]), span_markers])
682
+ span_id = np.cumsum(span_start_indicator).reshape(-1, 1)
683
+ spans = np.arange(num_spans).reshape(1, -1)
684
+ span_lengths = np.sum(span_id == spans, axis=0)
685
+ return span_lengths
686
+
687
+ noise_span_lengths = _sample_span_lengths(num_noise_tokens, num_noise_spans)
688
+ nonnoise_span_lengths = _sample_span_lengths(num_nonnoise_tokens,
689
+ num_noise_spans)
690
+ interleaved_span_lengths = np.reshape(
691
+ np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1),
692
+ [num_noise_spans * 2])
693
+
694
+ span_starts = np.cumsum(interleaved_span_lengths)[:-1]
695
+ span_start_indicator = np.zeros(length)
696
+ span_start_indicator[span_starts] = 1
697
+ span_id = np.cumsum(span_start_indicator)
698
+ is_noise = np.equal(np.mod(span_id, 2), 1)
699
+
700
+ mask = is_noise[:starting_length]
701
+
702
+ return mask
703
+
704
+
705
+ def _apply_mask(tokens: Union[torch.Tensor, Sequence[int], np.ndarray],
706
+ mask: np.ndarray,
707
+ use_sentinels: bool,
708
+ eos_token_id: int,
709
+ sentinel_token_ids: np.ndarray,
710
+ ensure_eos: bool = True) -> np.ndarray:
711
+ """Remove or replace masked portions from token sequence."""
712
+ if not use_sentinels:
713
+ # The logic is simple if we don't use sentinel tokens
714
+ noised_tokens = np.array(tokens)[np.logical_not(mask)]
715
+
716
+ # Ensure there's an end-of-sentence token at the end
717
+ if ensure_eos and (noised_tokens[-1] != eos_token_id):
718
+ noised_tokens = np.concatenate(
719
+ [noised_tokens, np.array([eos_token_id])])
720
+
721
+ return noised_tokens
722
+
723
+ # Masking at previous token
724
+ prev_token_mask = np.concatenate([np.array([0]), mask[:-1]])
725
+
726
+ # Decompose mask into start-of-span mask and non-start-of-span mask
727
+ start_of_noise_span_token = np.logical_and(mask,
728
+ np.logical_not(prev_token_mask))
729
+ nonstart_noise_span_token = np.logical_and(mask, prev_token_mask)
730
+
731
+ # Replace tokens at the start of each noise span with its corresponding
732
+ # sentinel token
733
+ sentinel_idx = np.minimum(len(sentinel_token_ids),
734
+ np.cumsum(start_of_noise_span_token)) - 1
735
+ tokens = np.where(start_of_noise_span_token,
736
+ sentinel_token_ids[sentinel_idx], tokens)
737
+
738
+ # Remove masked tokens (but preserving the sentinel tokens)
739
+ noised_tokens = tokens[np.logical_not(nonstart_noise_span_token)]
740
+
741
+ # Ensure there's an end-of-sentence token at the end
742
+ if ensure_eos and (noised_tokens[-1] != eos_token_id):
743
+ noised_tokens = np.concatenate(
744
+ [noised_tokens, np.array([eos_token_id])])
745
+ return noised_tokens
746
+
747
+
748
+ def _format_tokens_for_encoder_decoder(
749
+ tokens_inputs: torch.LongTensor,
750
+ tokens_labels: torch.LongTensor,
751
+ max_seq_length: int,
752
+ pad_token_id: int,
753
+ ) -> Dict[str, torch.Tensor]:
754
+ """Package the input/label sequence for an EncDec model."""
755
+ example = {}
756
+ # Re-populate with an empty, padded example
757
+ example['input_ids'] = torch.full((max_seq_length,),
758
+ pad_token_id,
759
+ dtype=torch.int32)
760
+ example['labels'] = torch.full((max_seq_length,),
761
+ _HF_IGNORE_INDEX,
762
+ dtype=torch.int32)
763
+ example['attention_mask'] = torch.zeros_like(example['input_ids'])
764
+ example['decoder_attention_mask'] = torch.zeros_like(example['labels'])
765
+
766
+ # Fill in with processed results (Note: EncDec format is right-padded)
767
+ example['input_ids'][:len(tokens_inputs)] = tokens_inputs
768
+ example['labels'][:len(tokens_labels)] = tokens_labels
769
+ example['attention_mask'][:len(tokens_inputs)] = 1
770
+ example['decoder_attention_mask'][:len(tokens_labels)] = 1
771
+
772
+ # Best practice is to include decoder_input_ids (= right-shifted labels)
773
+ example['decoder_input_ids'] = torch.full_like(example['labels'],
774
+ pad_token_id)
775
+ example['decoder_input_ids'][1:len(tokens_labels)] = tokens_labels[:-1]
776
+ return example
777
+
778
+
779
+ def _format_tokens_for_decoder_only(
780
+ tokens_inputs: torch.LongTensor,
781
+ tokens_labels: torch.LongTensor,
782
+ max_seq_length: int,
783
+ pad_token_id: int,
784
+ padding_side: str,
785
+ ) -> Dict[str, torch.Tensor]:
786
+ """Package the input/label sequence for an decoder-only model."""
787
+ example = {}
788
+ # Re-populate with an empty, padded example
789
+ example['input_ids'] = torch.full((max_seq_length,),
790
+ pad_token_id,
791
+ dtype=torch.int32)
792
+ example['labels'] = torch.full((max_seq_length,),
793
+ _HF_IGNORE_INDEX,
794
+ dtype=torch.int32)
795
+ example['attention_mask'] = torch.full((max_seq_length,),
796
+ 0,
797
+ dtype=torch.bool)
798
+ example['bidirectional_mask'] = torch.full((max_seq_length,),
799
+ 0,
800
+ dtype=torch.bool)
801
+
802
+ n_input = len(tokens_inputs)
803
+ n_label = len(tokens_labels)
804
+ n_concat = n_input + n_label
805
+ assert n_concat <= max_seq_length, f'{n_concat=}, {n_input=}, {n_label=}'
806
+
807
+ tokens_concat = torch.concat([tokens_inputs, tokens_labels], dim=0)
808
+
809
+ # Fill in with the processed results
810
+ if padding_side == 'left':
811
+ example['input_ids'][-n_concat:] = tokens_concat
812
+ # `labels` copies `input_ids` but with -100 at
813
+ # non-loss-generating tokens. `labels` will be shifted in the
814
+ # model code when computing loss.
815
+ example['labels'][-n_concat:] = tokens_concat
816
+ example['labels'][-n_concat:-n_label] = _HF_IGNORE_INDEX
817
+ example['attention_mask'][-n_concat:] = 1
818
+ example['bidirectional_mask'][-n_concat:-n_label] = 1
819
+ else:
820
+ example['input_ids'][:n_concat] = tokens_concat
821
+ # See above comment regarding `labels`
822
+ example['labels'][:n_concat] = tokens_concat
823
+ example['labels'][:n_input] = _HF_IGNORE_INDEX
824
+ example['attention_mask'][:n_concat] = 1
825
+ example['bidirectional_mask'][:n_input] = 1
826
+ return example
827
+
828
+
829
+ # Helpful to test if your dataloader is working locally
830
+ # Run `python denoising.py [local] [remote, optional]` and verify that batches
831
+ # are printed out
832
+ if __name__ == '__main__':
833
+ from llmfoundry.utils.builders import build_tokenizer
834
+
835
+ local = sys.argv[1]
836
+ if len(sys.argv) > 2:
837
+ remote = sys.argv[2]
838
+ else:
839
+ remote = local
840
+ print(f'Reading val split from {remote} -> {local}')
841
+
842
+ decoder_only = True
843
+
844
+ cfg = {
845
+ 'name': 'text_denoising',
846
+ 'dataset': {
847
+ 'local': local,
848
+ 'remote': remote,
849
+ 'split': 'val', # 'val_small',
850
+ 'shuffle': False,
851
+ 'max_seq_len': 2048 if decoder_only else 1024,
852
+ 'packing_ratio': 4.5,
853
+ 'predownload': 1000,
854
+ 'keep_zip': True, # in case we need compressed files after testing
855
+ },
856
+ 'mixture_of_denoisers': {
857
+ 'decoder_only_format': decoder_only,
858
+ 'span_mean_lengths_and_ratios': [[3, .15], [8, .5]],
859
+ 'sequence_mask_ratios': 0.25,
860
+ },
861
+ 'drop_last': False,
862
+ 'num_workers': 0,
863
+ }
864
+ cfg = om.create(cfg)
865
+ device_batch_size = 2
866
+
867
+ tokenizer_name = 'EleutherAI/gpt-neox-20b' if decoder_only else 't5-base'
868
+ tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len}
869
+ tokenizer = build_tokenizer(tokenizer_name=tokenizer_name,
870
+ tokenizer_kwargs=tokenizer_kwargs)
871
+
872
+ loader = build_text_denoising_dataloader(cfg, tokenizer, device_batch_size)
873
+ assert isinstance(loader.dataset, StreamingTextDataset)
874
+
875
+ print(f'\n\nTRUNCATING TO: {loader.dataset.max_seq_len}\n\n')
876
+
877
+ packing = cfg.dataset.get('packing_ratio') is not None
878
+ if packing:
879
+ tokenizer = loader.collate_fn.base_collator.tokenizer
880
+ else:
881
+ tokenizer = loader.collate_fn.tokenizer
882
+ batch_ix = 0
883
+ for batch in loader:
884
+ if batch_ix >= 50:
885
+ batch_ix += 1
886
+ break
887
+ if batch_ix >= 5:
888
+ if not packing:
889
+ break
890
+ batch_ix += 1
891
+ continue
892
+ print('\n')
893
+ print('#' * 20, f'Batch {batch_ix}', '#' * 20)
894
+ for k, v in batch.items():
895
+ print(k, v.shape, v.dtype)
896
+ for sample_ix, token_sample in enumerate(batch['input_ids']):
897
+ if cfg.mixture_of_denoisers.decoder_only_format:
898
+ labels = batch['labels'][sample_ix]
899
+ attn_inputs = batch['bidirectional_mask'][sample_ix].to(
900
+ torch.bool)
901
+ attn_full = batch['attention_mask'][sample_ix].to(torch.bool)
902
+ attn_labels = torch.logical_xor(attn_inputs, attn_full)
903
+ print('-' * 20, f' Sample {sample_ix} ', '-' * 20)
904
+ if packing:
905
+ for subseq in range(
906
+ int(batch['sequence_id'][sample_ix].max()) + 1):
907
+ is_subseq = batch['sequence_id'][sample_ix] == subseq
908
+ print(
909
+ '\033[93m{}\033[00m\n'.format('Input: '),
910
+ tokenizer.decode(token_sample[torch.logical_and(
911
+ is_subseq, attn_inputs)]))
912
+ print(
913
+ '\033[92m{}\033[00m\n'.format('Target: '),
914
+ tokenizer.decode(labels[torch.logical_and(
915
+ is_subseq, attn_labels)]))
916
+ else:
917
+ print('\033[91m{}\033[00m\n'.format('Full: '),
918
+ tokenizer.decode(token_sample[attn_full]))
919
+ print('\033[93m{}\033[00m\n'.format('Input: '),
920
+ tokenizer.decode(token_sample[attn_inputs]))
921
+ print('\033[92m{}\033[00m\n'.format('Target: '),
922
+ tokenizer.decode(labels[attn_labels]))
923
+ else:
924
+ labels = batch['labels'][sample_ix]
925
+ attn_inputs = batch['attention_mask'][sample_ix].to(torch.bool)
926
+ attn_labels = batch['decoder_attention_mask'][sample_ix].to(
927
+ torch.bool)
928
+ print('-' * 20, f' Sample {sample_ix} ', '-' * 20)
929
+ print('\033[93m{}\033[00m\n'.format('Input: '),
930
+ tokenizer.decode(token_sample[attn_inputs]))
931
+ print('\033[92m{}\033[00m\n'.format('Target: '),
932
+ tokenizer.decode(labels[attn_labels]))
933
+ batch_ix += 1
934
+
935
+ if packing:
936
+ print(f'Padding = {100*(1-loader.collate_fn.efficiency):5.2f}%')
937
+ print(f'Waste = {100*loader.collate_fn.waste:5.2f}%')
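Beyond the built-in `__main__` smoke test above, the span-corruption mask sampler can be checked in isolation. A quick sketch, assuming the module is importable as `llmfoundry.data.denoising`; the length and seed are arbitrary.

```python
# Sanity-check the span-corruption mask sampler defined above: for a length-512
# sequence with mask_ratio=0.15 and mean span length 3, roughly 15% of positions
# should come back masked, and position 0 is always unmasked.
import numpy as np

from llmfoundry.data.denoising import _sample_mask_array

np.random.seed(0)
mask = _sample_mask_array(length=512, mask_ratio=0.15, mean_span_length=3.0)
print(mask.shape)          # (512,)
print(bool(mask[0]))       # False -- sequences always start with a non-noise span
print(float(mask.mean()))  # ~0.15
```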
Perceptrix/finetune/build/lib/llmfoundry/data/finetuning/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator
5
+ from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
6
+
7
+ __all__ = ['Seq2SeqFinetuningCollator', 'build_finetuning_dataloader']
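A minimal, hedged usage sketch of the exported collator on a couple of hand-built, pre-tokenized examples; the GPT-NeoX tokenizer and the toy prompt are assumptions, and a real pipeline would get its examples from `build_finetuning_dataloader` instead.

```python
# Toy illustration of Seq2SeqFinetuningCollator in decoder-only mode.
from transformers import AutoTokenizer

from llmfoundry.data.finetuning import Seq2SeqFinetuningCollator

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
tokenizer.pad_token = tokenizer.eos_token  # the collator requires a pad token

collator = Seq2SeqFinetuningCollator(
    tokenizer=tokenizer,
    max_seq_len=128,
    decoder_only_format=True,
)

# Each example carries pre-tokenized context (input_ids) and target (labels).
prompt_ids = tokenizer('What is 2 + 2?', add_special_tokens=False).input_ids
target_ids = tokenizer(' 4', add_special_tokens=False).input_ids
examples = [{
    'input_ids': prompt_ids,
    'labels': target_ids,
    'attention_mask': [1] * len(prompt_ids),
}]

batch = collator(examples)
print({k: tuple(v.shape) for k, v in batch.items()})
```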
Perceptrix/finetune/build/lib/llmfoundry/data/finetuning/collator.py ADDED
@@ -0,0 +1,343 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import logging
5
+ import warnings
6
+ from typing import Any, Dict, List, Optional, Union
7
+
8
+ import torch
9
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
10
+
11
+ log = logging.getLogger(__name__)
12
+
13
+ # HuggingFace hardcodes the ignore index to -100
14
+ _HF_IGNORE_INDEX = -100
15
+
16
+
17
+ class Seq2SeqFinetuningCollator:
18
+ """A general-purpose collator for sequence-to-sequence training/evaluation.
19
+
20
+ Args:
21
+ tokenizer: A HuggingFace tokenizer. Must have a pad_token set.
22
+ max_seq_len (int): The maximum sequence length of the combined
23
+ context/target sequence (decoder-only format) or of each the
24
+ context sequence and target sequence (encoder-decoder format).
25
+ decoder_only_format (bool): Whether to format the batches for a
26
+ decoder-only model (if True) or an encoder-decoder model (if False).
27
+ allow_pad_trimming (bool, optional): Whether to allow the collator
28
+ to trim padding, which may result in smaller but inconsistent batch
29
+ sizes. Default: ``False`` ensures that all sequences are max_seq_len.
30
+ separator_text (str | bool, optional): If a string is provided, it will
31
+ be used to separate the context and target sequences (appended to end
32
+ of context). If ``True``, will use the tokenizer's sep_token, which must
33
+ be defined. Only applicable for decoder-only formatting.
34
+ format_for_generation (bool, optional): Whether to format the batch such
35
+ that context and target sequences remain separated, which is useful
36
+ when using the context to generate text which should be compared to the
37
+ target (e.g., during evaluation). Default: ``False``.
38
+ batch_metadata (dict, optional): A dictionary of metadata which will be added
39
+ to the batch.
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
45
+ max_seq_len: int,
46
+ decoder_only_format: bool,
47
+ allow_pad_trimming: bool = False,
48
+ separator_text: Optional[Union[str, bool]] = None,
49
+ format_for_generation: bool = False,
50
+ batch_metadata: Optional[Dict[str, Any]] = None,
51
+ ):
52
+ self.tokenizer = tokenizer
53
+ self.max_seq_len = max_seq_len
54
+ self.decoder_only_format = decoder_only_format
55
+ self.format_for_generation = format_for_generation
56
+ self.batch_metadata = batch_metadata or {}
57
+
58
+ # Trimming will always be skipped on at least the first __call__
59
+ self._allow_pad_trimming = allow_pad_trimming
60
+ self._seen_first_batch = False
61
+
62
+ illegal_keys = [
63
+ 'input_ids', 'labels', 'attention_mask', 'decoder_input_ids',
64
+ 'decoder_attention_mask', 'generate_output'
65
+ ]
66
+ found_keys = []
67
+ for illegal_key in illegal_keys:
68
+ if illegal_key in self.batch_metadata:
69
+ found_keys.append(illegal_key)
70
+ if found_keys:
71
+ raise ValueError(
72
+ f'The following keys are in batch_metadata but are not allowed: {", ".join(found_keys)}.\n' +\
73
+ f'You cannot use keys that are used directly by the models. The prohibited keys are:\n' +\
74
+ f'{", ".join(illegal_keys)}'
75
+ )
76
+ if self.format_for_generation:
77
+ self.batch_metadata['generate_output'] = True
78
+
79
+ if (max_seq_len % 8) != 0:
80
+ log.warning(
81
+ 'For performance, a max_seq_len as a multiple of 8 is recommended.'
82
+ )
83
+
84
+ if self.tokenizer.pad_token_id is None:
85
+ raise ValueError(
86
+ f'{self.__class__.__name__} requires that the tokenizer has the pad token set, but it is None'
87
+ )
88
+
89
+ self.separator_tokens = []
90
+ if separator_text and decoder_only_format:
91
+ if separator_text == True:
92
+ # Use the tokenizer's sep token or throw an error if undefined
93
+ if self.tokenizer.sep_token_id is None:
94
+ raise ValueError(
95
+ 'Setting separator_text=True requires that the tokenizer has sep_token_id but it has not been set. ' +\
96
+ 'Please pass a string argument for separator_text or set sep_token_id in the tokenizer.'
97
+ )
98
+ self.separator_tokens = [self.tokenizer.sep_token_id]
99
+ else:
100
+ # Convert the string separator_text into token(s)
101
+ self.separator_tokens = tokenizer(
102
+ separator_text, add_special_tokens=False).input_ids
103
+
104
+ self._warned_context = False
105
+ self._warned_target = False
106
+
107
+ def __call__(self, examples: List[Dict[str,
108
+ Any]]) -> Dict[str, torch.Tensor]:
109
+ for check_key in ['input_ids', 'labels', 'attention_mask']:
110
+ if check_key not in examples[0]:
111
+ raise KeyError(
112
+ f'Examples returned by dataset do not include required key: {check_key}'
113
+ )
114
+
115
+ if self.decoder_only_format:
116
+ batch = self._process_and_batch_decoder_only(examples)
117
+ else:
118
+ batch = self._process_and_batch_encoder_decoder(examples)
119
+
120
+ # Add any batch_metadata
121
+ batch_size = batch['input_ids'].shape[0]
122
+ batch.update({
123
+ k: torch.tensor([v] * batch_size)
124
+ for k, v in self.batch_metadata.items()
125
+ })
126
+
127
+ return batch
128
+
129
+ def _process_and_batch_decoder_only(
130
+ self, examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
131
+ # Steps explained in comments
132
+ processed_examples = []
133
+ for example in examples:
134
+ context = ensure_list(example['input_ids'])
135
+ target = ensure_list(example['labels'])
136
+ # First, get rid of any padding tokens
137
+ context = [t for t in context if t != self.tokenizer.pad_token_id]
138
+ target = [t for t in target if t != self.tokenizer.pad_token_id]
139
+ # Second, append any separator tokens to the context tokens
140
+ if self.separator_tokens:
141
+ context = context + self.separator_tokens
142
+ # Third, ensure that the target text ends with an eos tag
143
+ if target[-1] != self.tokenizer.eos_token_id:
144
+ target = target + [self.tokenizer.eos_token_id]
145
+
146
+ n_context = len(context)
147
+ n_target = len(target)
148
+
149
+ if n_context >= self.max_seq_len:
150
+ if not self._warned_context:
151
+ warnings.warn(
152
+ f'Skipping example because CONTEXT length={n_context} leaves no room ' +\
153
+ f'for TARGET tokens because max_seq_len={self.max_seq_len}. ' +\
154
+ f'If this causes downstream issues because of inconsistent batch sizes, ' +\
155
+ f'consider increasing max_seq_len or using example packing.'
156
+ )
157
+ self._warned_context = True
158
+ continue
159
+
160
+ if self.format_for_generation:
161
+ # When formatting for generation, we need to keep input_ids and
162
+ # labels separate. The input_ids (context) will be fed into the
163
+ # generator and the labels will be used by the eval metric.
164
+ input_ids = context[-self.max_seq_len:]
165
+ n_context = len(input_ids)
166
+ attention_mask = [1] * n_context
167
+ bidirectional_mask = [1] * n_context
168
+ # Annoyingly, we need to pad everything but input_ids
169
+ # and attention_mask ourselves
170
+ i_pad = [self.tokenizer.pad_token_id
171
+ ] * (self.max_seq_len - n_target)
172
+ z_pad = [0] * (self.max_seq_len - n_context)
173
+ if self.tokenizer.padding_side == 'left':
174
+ labels = i_pad + target
175
+ bidirectional_mask = z_pad + bidirectional_mask
176
+ else:
177
+ labels = target + i_pad
178
+ bidirectional_mask = bidirectional_mask + z_pad
179
+
180
+ else:
181
+ # We need to concatenate the context and target to get the
182
+ # full input sequence, cutting off any excess tokens from the
183
+ # end of the target
184
+ if n_context + n_target > self.max_seq_len:
185
+ old_n_target = int(n_target)
186
+ n_target = self.max_seq_len - n_context
187
+ if not self._warned_target:
188
+ warnings.warn(
189
+ f'Truncating TARGET sequence of length={old_n_target} to length={n_target}, ' +\
190
+ f'so context+target fit max_seq_len={self.max_seq_len}. If truncation is ' +\
191
+ f'a problem, consider increasing max_seq_len.')
192
+ self._warned_target = True
193
+ target = target[-n_target:]
194
+ target[-1] = self.tokenizer.eos_token_id
195
+ n_total = n_context + n_target
196
+
197
+ input_ids = context + target
198
+ labels = ([_HF_IGNORE_INDEX] * n_context) + target
199
+ attention_mask = [1] * n_total
200
+ # bidirectional_mask is used by our prefix lm model variants
201
+ bidirectional_mask = ([1] * n_context) + ([0] * n_target)
202
+
203
+ # Annoyingly, we need to pad everything but input_ids
204
+ # and attention_mask ourselves
205
+ i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - n_total)
206
+ z_pad = [0] * (self.max_seq_len - n_total)
207
+ if self.tokenizer.padding_side == 'left':
208
+ labels = i_pad + labels
209
+ bidirectional_mask = z_pad + bidirectional_mask
210
+ else:
211
+ labels = labels + i_pad
212
+ bidirectional_mask = bidirectional_mask + z_pad
213
+
214
+ # Update the example
215
+ example['input_ids'] = input_ids
216
+ example['labels'] = labels
217
+ example['attention_mask'] = attention_mask
218
+ example['bidirectional_mask'] = bidirectional_mask
219
+
220
+ processed_examples.append(example)
221
+
222
+ batch = self.tokenizer.pad(
223
+ processed_examples,
224
+ padding='max_length',
225
+ max_length=self.max_seq_len,
226
+ return_tensors='pt',
227
+ )
228
+
229
+ # This logic prevents trimming on at least the first batch
230
+ if not (self._allow_pad_trimming and self._seen_first_batch):
231
+ self._seen_first_batch = True
232
+ return batch
233
+ self._seen_first_batch = True
234
+
235
+ # The batch is ready, but we can trim padding for efficiency
236
+ multiple_of = 8
237
+
238
+ n_non_padding = batch['attention_mask'].sum(dim=1).max()
239
+ keep_tokens = int(multiple_of * torch.ceil(n_non_padding / multiple_of))
240
+ for k, v in batch.items():
241
+ if len(v.shape) < 2:
242
+ continue
243
+ if k == 'labels' and self.format_for_generation:
244
+ continue
245
+ if self.tokenizer.padding_side == 'left':
246
+ batch[k] = v[:, -keep_tokens:].contiguous()
247
+ else:
248
+ batch[k] = v[:, :keep_tokens].contiguous()
249
+
250
+ return batch
251
+
252
+ def _process_and_batch_encoder_decoder(
253
+ self, examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
254
+ # The encoder-decoder case has some gotchas.
255
+ # Steps are explained in comments.
256
+ processed_examples = []
257
+ for example in examples:
258
+ context = ensure_list(example['input_ids'])
259
+ target = ensure_list(example['labels'])
260
+ # ... first, get rid of any padding that was already applied
261
+ context = [t for t in context if t != self.tokenizer.pad_token_id]
262
+ target = [t for t in target if t != self.tokenizer.pad_token_id]
263
+ # ... second, ensure that the target text ends with an eos tag
264
+ if target[-1] != self.tokenizer.eos_token_id:
265
+ target = target + [self.tokenizer.eos_token_id]
266
+ # ... third, we need to pad labels ourselves, since the tokenizer's pad method does not pad them.
267
+ if len(target) < self.max_seq_len:
268
+ i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - len(target))
269
+ target = target + i_pad
270
+ else:
271
+ if not self._warned_target:
272
+ warnings.warn(
273
+ f'Truncating TARGET sequence of length={len(target)} ' +\
274
+ f'to max_seq_len={self.max_seq_len}. If truncation is ' +\
275
+ f'a problem, consider increasing max_seq_len.')
276
+ self._warned_target = True
277
+ target = target[:self.max_seq_len -
278
+ 1] + [self.tokenizer.eos_token_id]
279
+
280
+ # We might need to truncate the context. Preserve the beginning.
281
+ if len(context) > self.max_seq_len:
282
+ if not self._warned_context:
283
+ warnings.warn(
284
+ f'Truncating CONTEXT sequence of length={len(context)} ' +\
285
+ f'to max_seq_len={self.max_seq_len}. If truncation is ' +\
286
+ f'a problem, consider increasing max_seq_len.')
287
+ self._warned_context = True
288
+ context = context[:self.max_seq_len -
289
+ 1] + [self.tokenizer.eos_token_id]
290
+
291
+ # Back into the example
292
+ example['input_ids'] = context
293
+ example['attention_mask'] = [1] * len(context)
294
+ example['labels'] = target
295
+
296
+ processed_examples.append(example)
297
+
298
+ # Batch examples into a single dict (this also pads)
299
+ batch = self.tokenizer.pad(
300
+ processed_examples,
301
+ padding='max_length',
302
+ max_length=self.max_seq_len,
303
+ return_tensors='pt',
304
+ )
305
+ # We're still missing decoder_input_ids and decoder_attention_mask
306
+ batch['decoder_input_ids'] = torch.cat([
307
+ torch.full((len(processed_examples), 1),
308
+ self.tokenizer.pad_token_id), batch['labels'][:, :-1]
309
+ ],
310
+ dim=1)
311
+ batch['decoder_input_ids'].masked_fill_(
312
+ batch['decoder_input_ids'] == _HF_IGNORE_INDEX,
313
+ self.tokenizer.pad_token_id)
314
+ batch['decoder_attention_mask'] = torch.not_equal(
315
+ batch['labels'], _HF_IGNORE_INDEX)
316
+
317
+ # This logic prevents trimming on at least the first batch
318
+ if not (self._allow_pad_trimming and self._seen_first_batch):
319
+ self._seen_first_batch = True
320
+ return batch
321
+ self._seen_first_batch = True
322
+
323
+ # The batch is now valid, but we can trim padding for efficiency
324
+ multiple_of = 8
325
+ # (first for the encoder)
326
+ n_non_padding = batch['attention_mask'].sum(dim=1).max()
327
+ keep_tokens = int(multiple_of * torch.ceil(n_non_padding / multiple_of))
328
+ for k in ['input_ids', 'attention_mask']:
329
+ batch[k] = batch[k][:, :keep_tokens].contiguous()
330
+ # (then for the decoder)
331
+ n_non_padding = batch['decoder_attention_mask'].sum(dim=1).max()
332
+ keep_tokens = int(multiple_of * torch.ceil(n_non_padding / multiple_of))
333
+ for k in ['decoder_input_ids', 'decoder_attention_mask', 'labels']:
334
+ batch[k] = batch[k][:, :keep_tokens].contiguous()
335
+
336
+ return batch
337
+
338
+
339
+ def ensure_list(x: Union[List, torch.Tensor]) -> List:
340
+ if isinstance(x, torch.Tensor):
341
+ x = list(x.flatten())
342
+ assert isinstance(x, list)
343
+ return x
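
A minimal usage sketch of the Seq2SeqFinetuningCollator defined above, assuming a 'gpt2' tokenizer and a toy prompt/response pair (both illustrative, not values taken from this repository). Each example handed to the collator must already contain 'input_ids', 'labels', and 'attention_mask', which is exactly what _tokenize_formatted_example in tasks.py produces.

# Usage sketch (assumed tokenizer name and toy data; adjust to your setup).
from transformers import AutoTokenizer

from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator

tokenizer = AutoTokenizer.from_pretrained('gpt2')
# The collator requires a pad token; reuse EOS, as build_finetuning_dataloader does.
tokenizer.pad_token = tokenizer.eos_token

collator = Seq2SeqFinetuningCollator(
    tokenizer=tokenizer,
    max_seq_len=64,
    decoder_only_format=True,
)

# Tokenize a prompt/response pair the same way tasks.py does.
example = tokenizer(text='What is 2 + 2?\n', text_target='4')
batch = collator([example, example])
print(batch['input_ids'].shape)  # torch.Size([2, 64]) while padding to max_seq_len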
Perceptrix/finetune/build/lib/llmfoundry/data/finetuning/dataloader.py ADDED
@@ -0,0 +1,516 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import logging
4
+ import os
5
+ from typing import Tuple, Union
6
+
7
+ import datasets as hf_datasets
8
+ import torch
9
+ from composer.utils import dist, get_file, parse_uri
10
+ from omegaconf import DictConfig
11
+ from torch.utils.data import DataLoader
12
+ from transformers import PreTrainedTokenizerBase
13
+
14
+ from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator
15
+ from llmfoundry.data.finetuning.tasks import dataset_constructor
16
+ from llmfoundry.data.packing import BinPackWrapper
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+ # HuggingFace hardcodes the ignore index to -100
21
+ _HF_IGNORE_INDEX = -100
22
+
23
+
24
+ def build_finetuning_dataloader(cfg: DictConfig,
25
+ tokenizer: PreTrainedTokenizerBase,
26
+ device_batch_size: int) -> DataLoader:
27
+ """Builds a finetuning dataloader for training or evaluating.
28
+
29
+ The underlying dataset can be built through one of two code paths:
30
+ 1. As a HuggingFace dataset, via `datasets.load_dataset(...)`
31
+ 2. As a streaming dataset
32
+ You will need to set slightly different dataset config fields depending
33
+ on which you intend to use, as explained below.
34
+
35
+ Args:
36
+ cfg (DictConfig): An omegaconf dictionary used to configure the loader:
37
+ cfg.name (str): The type of dataloader to build. Must = "finetuning".
38
+ ---
39
+ *** HuggingFace dataset config fields ***
40
+ cfg.dataset.hf_name (str, optional): The name of the HuggingFace dataset
41
+ to use. Can also be a remote http(s) directory or object store bucket
42
+ containing the file {split}.jsonl in the format (prompt, response),
43
+ in which case the builder will create a HuggingFace dataset.
44
+ cfg.dataset.hf_kwargs (DictConfig, optional): Additional kwargs to
45
+ pass to `datasets.load_dataset`, which can be used to load
46
+ a dataset from local files.
47
+ cfg.dataset.preprocessing_fn (str, optional): The name/import path of
48
+ the preprocessing function to use for formatting the data examples.
49
+ If ``None`` (default), the builder will use the preprocessing function
50
+ registered under `hf_name` (see `tasks.py`), if one exists,
51
+ otherwise it will skip preprocessing.
52
+ If `preprocessing_fn` corresponds to a registered preprocessing
53
+ function in `tasks.py`, the builder will use that.
54
+ Otherwise, it will interpret `preprocessing_fn` as an
55
+ "import.path:function_name" import path; e.g., it will call
56
+ `from import.path import function_name` and use the imported
57
+ function as the preprocessing function.
58
+ *** Streaming dataset config fields ***
59
+ cfg.dataset.remote (str, optional): Location of a MDS-formatted
60
+ streaming dataset to use. Setting this will tell the builder
61
+ to create a streaming dataset rather than a HuggingFace dataset.
62
+ cfg.dataset.local (str, optional): Local path where remote data
63
+ will be streamed to. Only valid if `cfg.dataset.remote` has
64
+ also been set.
65
+ *** Shared dataset configs fields ***
66
+ cfg.dataset.max_seq_len (int): The maximum length of sequences
67
+ in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
68
+ for details.
69
+ cfg.dataset.decoder_only_format (bool): Whether to format the
70
+ examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
71
+ docstring for details.
72
+ cfg.dataset.allow_pad_trimming (bool, optional): Whether to allow
73
+ the collator to trim padding. See :class:`Seq2SeqFinetuningCollator`
74
+ docstring for details. Default: ``False``.
75
+ cfg.dataset.packing_ratio (float, optional): If provided, this invokes
76
+ a collator wrapper that packs `device_batch_size*packing_ratio`
77
+ raw examples into `device_batch_size` packed examples. This helps
78
+ minimize padding while preserving sequence integrity.
79
+ This adds `sequence_id` to the batch, which indicates which unique
80
+ sequence each token belongs to.
81
+ Note: Using this feature will not change device_batch_size but it
82
+ will determine the number of raw examples consumed by the dataloader
83
+ per batch. Some examples may be discarded if they do not fit when
84
+ packing.
85
+ Select `packing_ratio` **carefully** based on the dataset
86
+ statistics, `max_seq_len`, and tolerance for discarding samples!
87
+ The packing code in `../packing.py` provides a script that can help
88
+ you choose the best `packing_ratio`.
89
+ cfg.dataset.shuffle (bool): Whether to shuffle the dataset.
90
+ ___
91
+ See :class:`StreamingFinetuningDataset` for info on other standard config
92
+ options within `cfg.dataset` that will be passed as kwargs if
93
+ using the streaming codepath.
94
+ ---
95
+ See :class:`DataLoader` for standard argument options to the pytorch
96
+ dataloader, such as `cfg.drop_last`, `cfg.num_workers`, etc.
97
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
98
+ prepare the data from raw text. Any missing sentinel tokens will
99
+ be added by the collator.
100
+ device_batch_size (int): The size of the batches (number of examples)
101
+ that the dataloader will produce.
102
+
103
+ Returns:
104
+ A pytorch dataloader
105
+
106
+ Note:
107
+ You can run the script inside `../packing.py` to quickly test the
108
+ padding/waste rates for different `cfg.dataset.packing_ratio` choices,
109
+ given a starting workload YAML.
110
+ """
111
+ _validate_config(cfg.dataset)
112
+
113
+ # Use EOS as the pad token if none exists
114
+ if tokenizer.pad_token is None:
115
+ tokenizer.pad_token = tokenizer.eos_token
116
+
117
+ dataset = None # for pyright
118
+ if cfg.dataset.get('remote') is not None:
119
+ dataset = dataset_constructor.build_from_streaming(
120
+ tokenizer=tokenizer,
121
+ local=cfg.dataset.local,
122
+ remote=cfg.dataset.get('remote', None),
123
+ split=cfg.dataset.get('split', None),
124
+ download_retry=cfg.dataset.get('download_retry', 2),
125
+ download_timeout=cfg.dataset.get('download_timeout', 60),
126
+ validate_hash=cfg.dataset.get('validate_hash', None),
127
+ keep_zip=cfg.dataset.get('keep_zip', False),
128
+ epoch_size=cfg.dataset.get('epoch_size', None),
129
+ predownload=cfg.dataset.get('predownload', None),
130
+ cache_limit=cfg.dataset.get('cache_limit', None),
131
+ partition_algo=cfg.dataset.get('partition_algo', 'orig'),
132
+ num_canonical_nodes=cfg.dataset.get('num_canonical_nodes', None),
133
+ batch_size=device_batch_size,
134
+ shuffle=cfg.dataset.get('shuffle', False),
135
+ shuffle_algo=cfg.dataset.get('shuffle_algo', 'py1b'),
136
+ shuffle_seed=cfg.dataset.get('shuffle_seed', 9176),
137
+ shuffle_block_size=cfg.dataset.get('shuffle_block_size', 1 << 18),
138
+ sampling_method=cfg.dataset.get('sampling_method', 'balanced'),
139
+ sampling_granularity=cfg.dataset.get('sampling_granularity', 1),
140
+ batching_method=cfg.dataset.get('batching_method', 'random'),
141
+ )
142
+
143
+ collate_fn, dataloader_batch_size = _build_collate_fn(
144
+ cfg.dataset, tokenizer, device_batch_size)
145
+
146
+ return DataLoader(
147
+ dataset,
148
+ collate_fn=collate_fn,
149
+ batch_size=dataloader_batch_size,
150
+ drop_last=cfg.drop_last,
151
+ num_workers=cfg.num_workers,
152
+ pin_memory=cfg.get('pin_memory', True),
153
+ prefetch_factor=cfg.get('prefetch_factor', 2),
154
+ persistent_workers=cfg.get('persistent_workers', True),
155
+ timeout=cfg.get('timeout', 0),
156
+ )
157
+
158
+ else:
159
+ backend, _, _ = parse_uri(cfg.dataset.hf_name)
160
+ if backend not in ['', None]:
161
+ if cfg.dataset.get('split') is None:
162
+ raise ValueError(
163
+ 'When using a HuggingFace dataset from a URL, you must set the ' + \
164
+ '`split` key in the dataset config.'
165
+ )
166
+ dataset = _build_hf_dataset_from_remote(cfg, tokenizer)
167
+ else:
168
+ dataset = dataset_constructor.build_from_hf(
169
+ cfg.dataset,
170
+ max_seq_len=cfg.dataset.max_seq_len,
171
+ tokenizer=tokenizer,
172
+ )
173
+
174
+ collate_fn, dataloader_batch_size = _build_collate_fn(
175
+ cfg.dataset, tokenizer, device_batch_size)
176
+
177
+ if cfg.drop_last:
178
+ world_size = dist.get_world_size()
179
+ minimum_dataset_size = world_size * dataloader_batch_size
180
+ if hasattr(dataset, '__len__'):
181
+ full_dataset_size = len(dataset)
182
+ if full_dataset_size < minimum_dataset_size:
183
+ raise ValueError(
184
+ f'Your dataset (name={cfg.dataset.hf_name}, split={cfg.dataset.split}) '
185
+ +
186
+ f'has {full_dataset_size} samples, but your minimum batch size '
187
+ +
188
+ f'is {minimum_dataset_size} because you are running on {world_size} gpus and '
189
+ +
190
+ f'your per device batch size is {dataloader_batch_size}. Please increase the number '
191
+ +
192
+ f'of samples in your dataset to at least {minimum_dataset_size}.'
193
+ )
194
+
195
+ assert dataset is not None
196
+ return DataLoader(
197
+ dataset,
198
+ collate_fn=collate_fn,
199
+ batch_size=dataloader_batch_size,
200
+ drop_last=cfg.drop_last,
201
+ sampler=dist.get_sampler(dataset,
202
+ drop_last=cfg.drop_last,
203
+ shuffle=cfg.dataset.shuffle),
204
+ num_workers=cfg.num_workers,
205
+ pin_memory=cfg.get('pin_memory', True),
206
+ prefetch_factor=cfg.get('prefetch_factor', 2),
207
+ persistent_workers=cfg.get('persistent_workers', True),
208
+ timeout=cfg.get('timeout', 0),
209
+ )
210
+
211
+
212
+ def _validate_config(dataset_cfg: DictConfig) -> None:
213
+ """Validates the dataset configuration.
214
+
215
+ Makes sure that the dataset is properly configured for either
216
+ a HuggingFace dataset or a streaming dataset. Must be valid for one or
217
+ the other.
218
+
219
+ Args:
220
+ dataset_cfg (DictConfig): The dataset configuration to be validated.
221
+
222
+ Raises:
223
+ ValueError: If the dataset configuration does not meet the requirements.
224
+ """
225
+ if dataset_cfg.get('hf_name') is not None:
226
+ # Using the HuggingFace dataset codepath
227
+ illegal_keys = ['local', 'remote']
228
+ discovered_illegal_keys = []
229
+ for key in illegal_keys:
230
+ if dataset_cfg.get(key) is not None:
231
+ discovered_illegal_keys.append('`' + key + '`')
232
+ if discovered_illegal_keys:
233
+ raise ValueError(
234
+ 'The dataset config sets a value for `hf_name` as well as the ' +\
235
+ f'following keys: {", ".join(discovered_illegal_keys)}.\n' +\
236
+ 'Those keys are used when building from a streaming dataset, but ' +\
237
+ 'setting `hf_name` instructs the dataset to build from a HuggingFace dataset.'
238
+ )
239
+ elif dataset_cfg.get('remote') is not None:
240
+ # Using the streaming dataset codepath
241
+ illegal_keys = ['hf_name', 'hf_kwargs', 'preprocessing_fn']
242
+ discovered_illegal_keys = []
243
+ for key in illegal_keys:
244
+ if dataset_cfg.get(key) is not None:
245
+ discovered_illegal_keys.append('`' + key + '`')
246
+ if discovered_illegal_keys:
247
+ raise ValueError(
248
+ 'The dataset config sets a value for `remote` as well as the ' +\
249
+ f'following keys: {", ".join(discovered_illegal_keys)}.\n' +\
250
+ 'Those keys are used when building from a HuggingFace dataset, but ' +\
251
+ 'setting `remote` instructs the dataset to build from a streaming dataset.'
252
+ )
253
+ if dataset_cfg.get('local') is None:
254
+ raise ValueError(
255
+ 'Using a streaming dataset requires setting both `remote` and `local`, ' +\
256
+ 'but dataset.local is None.'
257
+ )
258
+ else:
259
+ raise ValueError(
260
+ 'In the dataset config, you must set either `hf_name` to use a ' +\
261
+ 'HuggingFace dataset or set `remote` to use a streaming ' +\
262
+ 'dataset, but both were None.'
263
+ )
264
+
265
+
266
+ def _build_hf_dataset_from_remote(
267
+ cfg: DictConfig, tokenizer: PreTrainedTokenizerBase
268
+ ) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset,
269
+ hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]:
270
+ """Builds a dataset from a remote object store.
271
+
272
+ This function supports 'jsonl', 'csv', and 'parquet' file formats for the dataset. It will attempt to download
273
+ the dataset, then once it is downloaded, convert it into HuggingFace ``datasets`` format, and then return this
274
+ dataset.
275
+
276
+ The function also ensures synchronicity across multiple processes during the file download. It creates a signal
277
+ file that is used to synchronize the start of the download across different processes. Once the download is
278
+ completed, the function removes the signal file.
279
+
280
+ Args:
281
+ cfg (DictConfig): The configuration dictionary containing the necessary parameters to load the dataset.
282
+ This includes:
283
+ - dataset.hf_name: The path of the HuggingFace dataset to download.
284
+ - dataset.split: The dataset split to download (e.g., 'train', 'validation', 'test').
285
+ - dataset.max_seq_len: The maximum sequence length for tokenizing the dataset.
286
+
287
+ tokenizer (Tokenizer): The tokenizer to be used to tokenize the dataset.
288
+
289
+ Returns:
290
+ Dataset: A HuggingFace dataset built from the remote file, prepared and tokenized for fine-tuning the model.
291
+
292
+ Raises:
293
+ FileNotFoundError: Raised if the dataset file cannot be found with any of the supported extensions.
294
+ """
295
+ supported_extensions = ['jsonl', 'csv', 'parquet']
296
+ # HF datasets does not support a split with dashes, so we replace dashes
297
+ # with underscores in the destination split.
298
+ destination_split = cfg.dataset.split.replace('-', '_')
299
+ finetune_dir = os.path.join(
300
+ os.path.dirname(
301
+ os.path.dirname(os.path.dirname(os.path.realpath(__file__)))),
302
+ 'downloaded_finetuning',
303
+ destination_split if destination_split != 'data' else 'data_not',
304
+ )
305
+ os.makedirs(finetune_dir, exist_ok=True)
306
+ for extension in supported_extensions:
307
+ name = f'{cfg.dataset.hf_name.strip("/")}/{cfg.dataset.split}.{extension}'
308
+ destination = str(
309
+ os.path.abspath(
310
+ os.path.join(
311
+ finetune_dir, 'data',
312
+ f'{destination_split}-00000-of-00001.{extension}')))
313
+
314
+ # Since we don't know which of the supported extensions the file will have,
315
+ # wait on a signal file rather than on the desired file itself
316
+ signal_file_path = os.path.join(
317
+ finetune_dir, f'.node_{dist.get_node_rank()}_local_rank0_completed')
318
+ if dist.get_local_rank() == 0:
319
+ try:
320
+ get_file(path=name, destination=destination, overwrite=True)
321
+ except FileNotFoundError as e:
322
+ if extension == supported_extensions[-1]:
323
+ files_searched = [
324
+ f'{cfg.dataset.hf_name}/{cfg.dataset.split}.{ext}'
325
+ for ext in supported_extensions
326
+ ]
327
+ raise FileNotFoundError(
328
+ f'Could not find a file with any of ' + \
329
+ f'the supported extensions: {supported_extensions}\n' + \
330
+ f'at {files_searched}'
331
+ ) from e
332
+ else:
333
+ log.debug(
334
+ f'Could not find {name}, looking for another extension')
335
+ continue
336
+
337
+ os.makedirs(os.path.dirname(signal_file_path), exist_ok=True)
338
+ with open(signal_file_path, 'wb') as f:
339
+ f.write(b'local_rank0_completed_download')
340
+
341
+ # Avoid the collective call until the local rank zero has finished trying to download the dataset file
342
+ # so that we don't timeout for large downloads. This syncs all processes on the node
343
+ with dist.local_rank_zero_download_and_wait(signal_file_path):
344
+ # Then, wait to ensure every node has finished downloading the dataset file
345
+ dist.barrier()
346
+
347
+ # clean up signal file
348
+ if dist.get_local_rank() == 0:
349
+ os.remove(signal_file_path)
350
+ dist.barrier()
351
+
352
+ cfg.dataset.hf_name = finetune_dir
353
+ log.info(cfg.dataset)
354
+ dataset = dataset_constructor.build_from_hf(
355
+ cfg.dataset,
356
+ max_seq_len=cfg.dataset.max_seq_len,
357
+ tokenizer=tokenizer,
358
+ )
359
+ return dataset
360
+
361
+
362
+ def _build_collate_fn(
363
+ dataset_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
364
+ device_batch_size: int
365
+ ) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackWrapper], int]:
366
+ collate_fn = Seq2SeqFinetuningCollator(
367
+ tokenizer=tokenizer,
368
+ max_seq_len=dataset_cfg.max_seq_len,
369
+ decoder_only_format=dataset_cfg.decoder_only_format,
370
+ allow_pad_trimming=dataset_cfg.get('allow_pad_trimming', False),
371
+ )
372
+
373
+ packing_ratio = dataset_cfg.get('packing_ratio')
374
+ if packing_ratio is None:
375
+ if dataset_cfg.get('max_leftover_bins_to_keep') is not None:
376
+ raise ValueError(
377
+ 'dataset.max_leftover_bins_to_keep has been defined, ' +\
378
+ 'but dataset.packing_ratio has not been set. Please set ' +\
379
+ 'the latter to turn on packing or remove the former from the config.')
380
+ return collate_fn, device_batch_size
381
+
382
+ if packing_ratio == 1.0:
383
+ return collate_fn, device_batch_size
384
+ elif packing_ratio < 1.0:
385
+ raise ValueError('packing_ratio must be >= 1, if supplied')
386
+
387
+ if not dataset_cfg.decoder_only_format:
388
+ raise NotImplementedError(
389
+ 'On-the-fly packing is currently only supported for decoder-only formats.'
390
+ )
391
+
392
+ collate_fn = BinPackWrapper(
393
+ collator=collate_fn,
394
+ target_batch_size=device_batch_size,
395
+ max_seq_len=dataset_cfg.max_seq_len,
396
+ pad_token_id=tokenizer.pad_token_id,
397
+ padding_side=tokenizer.padding_side,
398
+ max_leftover_bins_to_keep=dataset_cfg.get('max_leftover_bins_to_keep'),
399
+ )
400
+ n_examples_to_pack = int(device_batch_size * packing_ratio)
401
+ return collate_fn, n_examples_to_pack
402
+
403
+
404
+ if __name__ == '__main__':
405
+ import torch
406
+ from omegaconf import OmegaConf as om
407
+
408
+ from llmfoundry.utils import build_tokenizer
409
+ cfg = om.create({
410
+ 'dataset': {
411
+ 'hf_name':
412
+ 'tatsu-lab/alpaca',
413
+ 'preprocessing_fn':
414
+ 'llmfoundry.data.finetuning.tasks:alpaca_preprocessing_function',
415
+ 'split':
416
+ 'train',
417
+ 'packing_ratio':
418
+ 18.0,
419
+ 'max_seq_len':
420
+ 2048,
421
+ 'decoder_only_format':
422
+ True,
423
+ 'separator_text':
424
+ False,
425
+ 'allow_pad_trimming':
426
+ False,
427
+ 'num_canonical_nodes':
428
+ 472,
429
+ 'shuffle':
430
+ True,
431
+ },
432
+ 'drop_last': False,
433
+ 'num_workers': 0,
434
+ 'pin_memory': False,
435
+ 'prefetch_factor': 2,
436
+ 'persistent_workers': False,
437
+ 'timeout': 0
438
+ })
439
+
440
+ tokenizer_name = 'EleutherAI/gpt-neox-20b'
441
+ tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len}
442
+ tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
443
+
444
+ device_batch_size = 2
445
+ dataloader = build_finetuning_dataloader(cfg, tokenizer, device_batch_size)
446
+
447
+ packing = cfg.dataset.get('packing_ratio') is not None
448
+
449
+ for i, batch in enumerate(dataloader):
450
+ if i >= 5:
451
+ break
452
+ print(f'-----Batch {i}-----')
453
+ for k, v in batch.items():
454
+ if isinstance(v, torch.Tensor):
455
+ print(k, v.shape)
456
+ else:
457
+ print(k, v)
458
+ for j in range(device_batch_size):
459
+ print(f'--- Sample {j} ---')
460
+ if cfg.dataset.decoder_only_format:
461
+ if packing:
462
+ for subseq in range(int(batch['sequence_id'][j].max()) + 1):
463
+ is_subseq = batch['sequence_id'][j] == subseq
464
+ print(
465
+ '\033[93m{}\033[00m\n'.format('INPUT IDS:'),
466
+ tokenizer.decode(batch['input_ids'][
467
+ j,
468
+ torch.logical_and(
469
+ is_subseq, batch['attention_mask'][j] ==
470
+ 1)],
471
+ skip_special_tokens=False))
472
+ print(
473
+ '\033[92m{}\033[00m\n'.format('CONTEXT: '),
474
+ tokenizer.decode(batch['input_ids'][
475
+ j,
476
+ torch.logical_and(
477
+ is_subseq, batch['bidirectional_mask'][j] ==
478
+ 1)],
479
+ skip_special_tokens=False))
480
+ print(
481
+ '\033[91m{}\033[00m\n'.format('TARGET: '),
482
+ tokenizer.decode(batch['input_ids'][
483
+ j,
484
+ torch.logical_and(
485
+ is_subseq,
486
+ batch['labels'][j] != _HF_IGNORE_INDEX)],
487
+ skip_special_tokens=False))
488
+ else:
489
+ print(
490
+ '\033[93m{}\033[00m\n'.format('INPUT IDS:'),
491
+ tokenizer.decode(
492
+ batch['input_ids'][j,
493
+ batch['attention_mask'][j] == 1],
494
+ skip_special_tokens=False))
495
+ print(
496
+ '\033[92m{}\033[00m\n'.format('CONTEXT: '),
497
+ tokenizer.decode(batch['input_ids'][
498
+ j, batch['bidirectional_mask'][j] == 1],
499
+ skip_special_tokens=False))
500
+ print(
501
+ '\033[91m{}\033[00m\n'.format('TARGET: '),
502
+ tokenizer.decode(batch['input_ids'][
503
+ j, batch['labels'][j] != _HF_IGNORE_INDEX],
504
+ skip_special_tokens=False))
505
+ else:
506
+ print(
507
+ '\033[92m{}\033[00m\n'.format('CONTEXT: '),
508
+ tokenizer.decode(
509
+ batch['input_ids'][j, batch['attention_mask'][j] == 1],
510
+ skip_special_tokens=False))
511
+ print(
512
+ '\033[91m{}\033[00m\n'.format('TARGET: '),
513
+ tokenizer.decode(batch['labels'][
514
+ j, batch['decoder_attention_mask'][j] == 1],
515
+ skip_special_tokens=False))
516
+ print(' ')
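
The __main__ demo above exercises the HuggingFace codepath (hf_name). For the streaming codepath described in the build_finetuning_dataloader docstring, the config instead sets `remote` and `local`. A sketch of such a config follows; the object store URI, local cache directory, worker count, and batch size are placeholders rather than values from this repository.

from omegaconf import OmegaConf as om

from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
from llmfoundry.utils import build_tokenizer

streaming_cfg = om.create({
    'dataset': {
        # `remote`/`local` select the streaming codepath; `hf_name` must stay unset.
        'remote': 's3://my-bucket/my-finetuning-data',  # placeholder object store URI
        'local': '/tmp/my-finetuning-data',             # placeholder local cache dir
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
        'shuffle': True,
    },
    'drop_last': False,
    'num_workers': 8,
})

tokenizer = build_tokenizer('EleutherAI/gpt-neox-20b',
                            {'model_max_length': streaming_cfg.dataset.max_seq_len})
dataloader = build_finetuning_dataloader(streaming_cfg, tokenizer, device_batch_size=2)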
Perceptrix/finetune/build/lib/llmfoundry/data/finetuning/tasks.py ADDED
@@ -0,0 +1,433 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Includes code for task-specific seq-to-seq data formatting.
5
+
6
+ This file provides some templates/examples of preprocessing functions
7
+ that format examples for use in seq-to-seq finetuning tasks.
8
+ These preprocessing functions take individual examples that contain raw
9
+ text and process them into formatted examples.
10
+
11
+ These functions have this basic structure:
12
+
13
+ def preprocessing_fn(example: Dict) -> Dict[str, str]:
14
+ # code to extract prompt/response from `example`
15
+ ...
16
+ return {
17
+ 'prompt': <prompt>,
18
+ 'response': <response>,
19
+ }
20
+
21
+ where `<prompt>` is a placeholder for the prompt text string that you
22
+ extracted from the input example, and '<response>' is a placeholder for
23
+ the response text string.
24
+
25
+ Just to be clear, "prompt" represents the text you would give the model
26
+ at inference time, and "response" represents the text you are training
27
+ it to produce given the prompt.
28
+
29
+ The key requirement of these functions is that they return a dictionary
30
+ with "prompt" and "response" keys, and that the values associated with
31
+ those keys are strings (i.e. text).
32
+ """
33
+
34
+ import importlib
35
+ import logging
36
+ import os
37
+ import warnings
38
+ from typing import Any, Callable, Dict, List, Optional, Union
39
+
40
+ import datasets as hf_datasets
41
+ from omegaconf import DictConfig
42
+ from streaming import StreamingDataset
43
+ from transformers import PreTrainedTokenizerBase
44
+
45
+ log = logging.getLogger(__name__)
46
+
47
+ __all__ = ['dataset_constructor']
48
+
49
+
50
+ def _tokenize_formatted_example(
51
+ example: Dict[str, Any],
52
+ tokenizer: PreTrainedTokenizerBase) -> Dict[str, List[int]]:
53
+ if ('prompt' not in example) or ('response' not in example):
54
+ raise KeyError(
55
+ 'Unable to tokenize example because it has not been properly formatted. ' +\
56
+ '"prompt" and "response" are required keys but at least one was missing ' +\
57
+ f'from {example=}.'
58
+ )
59
+ if not isinstance(example['prompt'], str):
60
+ raise TypeError(
61
+ f'Unable to tokenize example because "prompt" was not a string. {example=}'
62
+ )
63
+ if not isinstance(example['response'], str):
64
+ raise TypeError(
65
+ f'Unable to tokenize example because "response" was not a string. {example=}'
66
+ )
67
+ return tokenizer(text=example['prompt'], text_target=example['response'])
68
+
69
+
70
+ class StreamingFinetuningDataset(StreamingDataset):
71
+ """Finetuning dataset with flexible tokenization using StreamingDataset.
72
+
73
+ Args:
74
+ tokenizer (Tokenizer): The name of the HuggingFace tokenizer to use to
75
+ tokenize samples.
76
+ local (str): Local dataset directory where shards are cached by split.
77
+ remote (str, optional): Remote path or directory to download the dataset from. If ``None``,
78
+ its data must exist locally. StreamingDataset uses either ``streams`` or
79
+ ``remote``/``local``. Defaults to ``None``.
80
+ split (str, optional): Which dataset split to use, if any. If provided, we stream from/to
81
+ the ``split`` subdirs of ``remote`` and ``local``. Defaults to ``None``.
82
+ download_retry (int): Number of download re-attempts before giving up. Defaults to ``2``.
83
+ download_timeout (float): Number of seconds to wait for a shard to download before raising
84
+ an exception. Defaults to ``60``.
85
+ validate_hash (str, optional): Optional hash or checksum algorithm to use to validate
86
+ shards. Defaults to ``None``.
87
+ keep_zip (bool): Whether to keep or delete the compressed form when decompressing
88
+ downloaded shards. If ``False``, keep iff remote is local or no remote. Defaults to
89
+ ``False``.
90
+ epoch_size (int, optional): Number of samples to draw per epoch balanced across all
91
+ streams. If ``None``, takes its value from the total number of underlying samples.
92
+ Provide this field if you are weighting streams relatively to target a larger or
93
+ smaller epoch size. Defaults to ``None``.
94
+ predownload (int, optional): Target number of samples ahead to download the shards of while
95
+ iterating. Defaults to ``100_000``.
96
+ cache_limit (Union[int, str], optional): Maximum size in bytes of this StreamingDataset's
97
+ shard cache. Before downloading a shard, the least recently used resident shard(s) may
98
+ be evicted (deleted from the local cache) in order to stay under the limit. Set to None
99
+ to disable shard eviction. Supports integer bytes as well as string human-readable
100
+ bytes (e.g., 100b, 64kb, 77mb, and so on). Defaults to None.
101
+ partition_algo (str): Which partitioning algorithm to use. Defaults to ``orig``.
102
+ num_canonical_nodes (int, optional): Canonical number of nodes for shuffling with
103
+ resumption. Defaults to ``None``, which is interpreted as the number of nodes of the
104
+ initial run.
105
+ batch_size (int, optional): Batch size of its DataLoader, which affects how the dataset is
106
+ partitioned over the workers. Defaults to ``None``.
107
+ shuffle (bool): Whether to iterate over the samples in randomized order. Defaults to
108
+ ``False``.
109
+ shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1b``.
110
+ shuffle_seed (int): Seed for Deterministic data shuffling. Defaults to ``9176``.
111
+ shuffle_block_size (int): Unit of shuffle. Defaults to ``1 << 18``.
112
+ sampling_method (str): Which sampling method to use, either ``balanced`` or ``fixed``.
113
+ Defaults to ``balanced``.
114
+ sampling_granularity (int): When picking samples for a stream's final partial repeat,
115
+ how many samples to pick from the same shard at a time (``1`` for evenly balanced
116
+ across shards, ``1000`` to pick 1000 samples from the same shard at a time, etc).
117
+ Defaults to ``1``.
118
+ batching_method (str): Which batching method to use, either ``random``, ``stratified``, or
119
+ ``per_stream``. Defaults to ``random``.
120
+ """
121
+
122
+ def __init__(self,
123
+ tokenizer: PreTrainedTokenizerBase,
124
+ local: str,
125
+ remote: Optional[str] = None,
126
+ split: Optional[str] = None,
127
+ download_retry: int = 2,
128
+ download_timeout: float = 60,
129
+ validate_hash: Optional[str] = None,
130
+ keep_zip: bool = False,
131
+ epoch_size: Optional[int] = None,
132
+ predownload: Optional[int] = None,
133
+ cache_limit: Optional[Union[int, str]] = None,
134
+ partition_algo: str = 'orig',
135
+ num_canonical_nodes: Optional[int] = None,
136
+ batch_size: Optional[int] = None,
137
+ shuffle: bool = False,
138
+ shuffle_algo: str = 'py1b',
139
+ shuffle_seed: int = 9176,
140
+ shuffle_block_size: int = 1 << 18,
141
+ sampling_method: str = 'balanced',
142
+ sampling_granularity: int = 1,
143
+ batching_method: str = 'random',
144
+ **kwargs: Any):
145
+
146
+ if len(kwargs) > 0:
147
+ raise ValueError(
148
+ f'StreamingFinetuningDataset() got an unexpected keyword argument: {kwargs}'
149
+ )
150
+
151
+ if remote is None or (local == remote):
152
+ if os.path.isdir(local):
153
+ contents = set(os.listdir(local))
154
+ if split not in contents:
155
+ raise ValueError(
156
+ f'local directory {local} does not contain split {split}'
157
+ )
158
+
159
+ # Build Dataset
160
+ super().__init__(
161
+ local=local,
162
+ remote=remote,
163
+ split=split,
164
+ download_retry=download_retry,
165
+ download_timeout=download_timeout,
166
+ validate_hash=validate_hash,
167
+ keep_zip=keep_zip,
168
+ epoch_size=epoch_size,
169
+ predownload=predownload,
170
+ cache_limit=cache_limit,
171
+ partition_algo=partition_algo,
172
+ num_canonical_nodes=num_canonical_nodes,
173
+ batch_size=batch_size,
174
+ shuffle=shuffle,
175
+ shuffle_algo=shuffle_algo,
176
+ shuffle_seed=shuffle_seed,
177
+ shuffle_block_size=shuffle_block_size,
178
+ sampling_method=sampling_method,
179
+ sampling_granularity=sampling_granularity,
180
+ batching_method=batching_method,
181
+ )
182
+
183
+ self.tokenizer = tokenizer
184
+
185
+ # How to process a sample
186
+ def __getitem__(self, idx: int) -> Dict[str, Any]:
187
+ sample = super().__getitem__(idx)
188
+ return _tokenize_formatted_example(sample, tokenizer=self.tokenizer)
189
+
190
+
191
+ class DatasetConstructor:
192
+
193
+ def __init__(self):
194
+ self._task_preprocessing_registry: Dict[str, Callable] = {}
195
+
196
+ def register(self, *names: str) -> Callable[[Callable], Callable]:
197
+ """Decorator for registering preprocessing functions."""
198
+
199
+ def _register_func(name: str, func: Callable) -> None:
200
+ if name in self._task_preprocessing_registry:
201
+ raise ValueError(
202
+ f'A preprocessing function has already been registered with {name=}.'
203
+ )
204
+ self._task_preprocessing_registry[name] = func
205
+ return
206
+
207
+ def wrapper(func: Callable) -> Callable:
208
+ for name in names:
209
+ _register_func(name, func)
210
+ return func
211
+
212
+ return wrapper
213
+
214
+ def print_registered_tasks(self) -> None:
215
+ tasks = sorted(self._task_preprocessing_registry.keys())
216
+ print('\n'.join(tasks))
217
+
218
+ def get_preprocessing_fn_from_dict(
219
+ self, mapping: Union[Dict, DictConfig]
220
+ ) -> Callable[[Dict[str, Any]], Dict[str, str]]:
221
+ """Get a preprocessing function from a dictionary.
222
+
223
+ The dictionary maps column names in the dataset to "prompt" and "response".
224
+ For example,
225
+ ```yaml
226
+ preprocessing_fn:
227
+ prompt: text
228
+ response: summary
229
+ ```
230
+ would map the `text` column to the prompt and the `summary` column to the response.
231
+
232
+ Args:
233
+ mapping (dict): A dictionary mapping column names to "prompt" and "response".
234
+
235
+ Returns:
236
+ Callable: The preprocessing function.
237
+
238
+ Raises:
239
+ ValueError: If the mapping does not have keys "prompt" and "response".
240
+ """
241
+
242
+ def _preprocessor(example: Dict[str, Any]) -> Dict[str, str]:
243
+ if list(mapping.keys()) != ['prompt', 'response']:
244
+ raise ValueError(
245
+ f'Expected {mapping=} to have keys "prompt" and "response".'
246
+ )
247
+ return {
248
+ 'prompt': example[mapping['prompt']],
249
+ 'response': example[mapping['response']]
250
+ }
251
+
252
+ return _preprocessor
253
+
254
+ def get_preprocessing_fn_from_str(
255
+ self,
256
+ preprocessor: Optional[str],
257
+ dataset_name: Optional[str] = None
258
+ ) -> Optional[Callable[[Dict[str, Any]], Dict[str, str]]]:
259
+ """Get a preprocessing function from a string.
260
+
261
+ String can be either a registered function or an import path.
262
+
263
+ Args:
264
+ preprocessor (Optional[str]): The name of the preprocessing function, or an import path.
265
+ dataset_name (Optional[str]): The dataset name to look up in the registry.
266
+
267
+ Returns:
268
+ Callable: The preprocessing function or None if not found.
269
+
270
+ Raises:
271
+ ValueError: If the preprocessing function import from the provided string fails.
272
+ """
273
+ if preprocessor is None:
274
+ if dataset_name is None:
275
+ return None
276
+ if dataset_name in self._task_preprocessing_registry:
277
+ log.info(
278
+ f'Re-formatting dataset with "{dataset_name}" preprocessing function.'
279
+ )
280
+ return self._task_preprocessing_registry[dataset_name]
281
+ else:
282
+ log.info('No preprocessor was supplied and no preprocessing function ' +\
283
+ f'is registered for dataset name "{dataset_name}". No additional ' +\
284
+ 'preprocessing will be applied. If the dataset is already formatted ' +\
285
+ 'correctly, you can ignore this message.')
286
+ return None
287
+ if preprocessor in self._task_preprocessing_registry:
288
+ log.info(
289
+ f'Re-formatting dataset with "{preprocessor}" preprocessing function.'
290
+ )
291
+ return self._task_preprocessing_registry[preprocessor]
292
+
293
+ try:
294
+ import_path, function_name = preprocessor.split(':', maxsplit=1)
295
+ module = importlib.import_module(import_path)
296
+ preprocessing_fn = getattr(module, function_name)
297
+ except Exception as e:
298
+ raise ValueError(
299
+ f'Failed to import preprocessing function from string = {preprocessor}.'
300
+ ) from e
301
+
302
+ return preprocessing_fn
303
+
304
+ def build_from_hf(
305
+ self, cfg: DictConfig, max_seq_len: int,
306
+ tokenizer: PreTrainedTokenizerBase
307
+ ) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset,
308
+ hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]:
309
+ """Load a HuggingFace Datasets, preprocess, and tokenize.
310
+
311
+ Note: This function will drop examples where the prompt is longer than the max_seq_len
312
+
313
+ Args:
314
+ cfg (DictConfig): The dataset configuration.
315
+ max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped.
316
+ tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset.
317
+
318
+ Returns:
319
+ Dataset: The tokenized dataset.
320
+ """
321
+ dataset_name = cfg.hf_name
322
+ # HF datasets does not support a split with dashes, so we replace split
323
+ # dashes with underscores.
324
+ split = cfg.split.replace('-', '_')
325
+ kwargs = cfg.get('hf_kwargs', {})
326
+ proto_preprocessing_fn = cfg.get('preprocessing_fn')
327
+ if isinstance(proto_preprocessing_fn, dict) or isinstance(
328
+ proto_preprocessing_fn, DictConfig):
329
+ preprocessing_fn = self.get_preprocessing_fn_from_dict(
330
+ proto_preprocessing_fn)
331
+ else:
332
+ preprocessing_fn = self.get_preprocessing_fn_from_str(
333
+ proto_preprocessing_fn, dataset_name)
334
+
335
+ dataset = hf_datasets.load_dataset(dataset_name, split=split, **kwargs)
336
+
337
+ def dataset_mapper(example: Dict):
338
+ if preprocessing_fn is not None:
339
+ example = preprocessing_fn(example)
340
+ return _tokenize_formatted_example(example, tokenizer)
341
+
342
+ columns_to_remove = list(dataset[0].keys())
343
+ tokenized_dataset = dataset.map(
344
+ dataset_mapper,
345
+ batched=False,
346
+ remove_columns=columns_to_remove,
347
+ )
348
+ prompt_length_filtered_dataset = tokenized_dataset.filter(
349
+ lambda example: len(example['input_ids']) < max_seq_len)
350
+
351
+ examples_removed = len(tokenized_dataset) - len(
352
+ prompt_length_filtered_dataset)
353
+ if examples_removed > 0:
354
+ warnings.warn(
355
+ f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}.'
356
+ )
357
+
358
+ empty_examples_dropped_dataset = prompt_length_filtered_dataset.filter(
359
+ lambda example: len(example['input_ids']) > 0 and len(example[
360
+ 'labels']) > 0 and any(token_id != tokenizer.pad_token_id
361
+ for token_id in example['labels']))
362
+ empty_examples_removed = len(prompt_length_filtered_dataset) - len(
363
+ empty_examples_dropped_dataset)
364
+ if empty_examples_removed > 0:
365
+ warnings.warn(
366
+ f'Dropped {empty_examples_removed} examples where the prompt or response was empty, '
367
+ + 'or the response was only padding tokens.')
368
+
369
+ return empty_examples_dropped_dataset
370
+
371
+ def build_from_streaming(self, *args: Any,
372
+ **kwargs: Any) -> StreamingFinetuningDataset:
373
+ return StreamingFinetuningDataset(*args, **kwargs)
374
+
375
+
376
+ dataset_constructor = DatasetConstructor()
377
+
378
+
379
+ @dataset_constructor.register('tatsu-lab/alpaca')
380
+ def alpaca_preprocessing_function(inp: Dict) -> Dict[str, str]:
381
+ """Split out prompt/response from text."""
382
+ try:
383
+ prompt, response = inp['text'].split('### Response:')
384
+ prompt += '### Response:'
385
+ except Exception as e:
386
+ raise ValueError(
387
+ f"Unable to extract prompt/response from 'text'={inp['text']}"
388
+ ) from e
389
+ return {'prompt': prompt, 'response': response}
390
+
391
+
392
+ @dataset_constructor.register('HuggingFaceH4/databricks_dolly_15k')
393
+ def dolly_preprocessing_function(inp: Dict) -> Dict[str, str]:
394
+ """Format the text string."""
395
+ PROMPT_FORMAT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n'
396
+ try:
397
+ if inp['input'] != '':
398
+ instruction = inp['instruction'] + '\n' + inp['input']
399
+ else:
400
+ instruction = inp['instruction']
401
+ prompt = PROMPT_FORMAT.format(instruction=instruction)
402
+ response = inp['output']
403
+ except Exception as e:
404
+ raise ValueError(
405
+ f'Unable to extract prompt/response from {inp=}') from e
406
+ return {'prompt': prompt, 'response': response}
407
+
408
+
409
+ @dataset_constructor.register('bigscience/P3')
410
+ def p3_preprocessing_function(inp: Dict) -> Dict[str, str]:
411
+ """Format the already-split example."""
412
+ return {
413
+ 'prompt': inp['inputs'] + ':',
414
+ 'response': inp['targets'],
415
+ }
416
+
417
+
418
+ # Muennighoff's P3 and flan datasets share a similar convention
419
+ @dataset_constructor.register('Muennighoff/P3', 'Muennighoff/flan')
420
+ def muennighoff_tokenize_function(inp: Dict) -> Dict[str, str]:
421
+ """Format the already-split example."""
422
+ try:
423
+ prompt: str = inp['inputs']
424
+ response: str = inp['targets']
425
+ # Put a space before the response if needed
426
+ transitions = (' ', '\n', '\t')
427
+ if not (prompt.endswith(transitions) or
428
+ response.startswith(transitions)):
429
+ response = ' ' + response
430
+ except Exception as e:
431
+ raise ValueError(
432
+ f'Unable to process prompt/response from {inp=}') from e
433
+ return {'prompt': prompt, 'response': response}
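
A custom dataset can plug into the same registry used by the functions above. The sketch below assumes a hypothetical dataset whose rows carry 'question' and 'answer' columns; the registry name and column names are illustrative only.

from typing import Dict

from llmfoundry.data.finetuning.tasks import dataset_constructor


@dataset_constructor.register('my-org/my-qa-dataset')  # hypothetical hf_name
def qa_preprocessing_function(inp: Dict) -> Dict[str, str]:
    """Map raw columns onto the required prompt/response keys."""
    return {
        'prompt': inp['question'] + '\n### Response:\n',
        'response': inp['answer'],
    }

# The same mapping can also be expressed declaratively in the dataset config:
#   preprocessing_fn:
#     prompt: question
#     response: answer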
Perceptrix/finetune/build/lib/llmfoundry/data/packing.py ADDED
@@ -0,0 +1,423 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import os
5
+ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
6
+
7
+ import numpy as np
8
+ import torch
9
+ from omegaconf import DictConfig
10
+ from transformers import PreTrainedTokenizerBase
11
+
12
+
13
+ class BinPackWrapper:
14
+ """Utility collator for packing to reduce padding."""
15
+
16
+ def __init__(self,
17
+ collator: Callable,
18
+ target_batch_size: int,
19
+ max_seq_len: int,
20
+ pad_token_id: int,
21
+ padding_side: Literal['left', 'right'],
22
+ max_leftover_bins_to_keep: Optional[int] = None):
23
+ self.base_collator = collator
24
+ self.out_size = int(target_batch_size)
25
+ self.max_seq_len = int(max_seq_len)
26
+ self.pad_token_id = int(pad_token_id)
27
+ self.padding_side = padding_side
28
+
29
+ if self.out_size <= 0:
30
+ raise ValueError(f'{target_batch_size=} must be >0.')
31
+ if self.max_seq_len <= 0:
32
+ raise ValueError(f'{max_seq_len=} must be >0.')
33
+ if self.pad_token_id < 0:
34
+ raise ValueError(f'{pad_token_id=} must be >=0.')
35
+
36
+ if max_leftover_bins_to_keep is None:
37
+ self.max_leftover_bins_to_keep = int(10 * self.out_size)
38
+ elif max_leftover_bins_to_keep < 0:
39
+ raise ValueError(
40
+ f'{max_leftover_bins_to_keep=} must be >=0 or None.')
41
+ else:
42
+ self.max_leftover_bins_to_keep = int(max_leftover_bins_to_keep)
43
+
44
+ self.n_packed_tokens = 0
45
+ self.n_total_tokens = 0
46
+ self.n_packed_examples = 0
47
+
48
+ self._leftover_bins: List[Tuple[int, Dict[str, torch.Tensor]]] = []
49
+
50
+ @property
51
+ def waste(self) -> float:
52
+ return 1 - (self.n_packed_tokens / self.n_total_tokens)
53
+
54
+ @property
55
+ def efficiency(self) -> float:
56
+ return self.n_packed_tokens / (self.max_seq_len *
57
+ self.n_packed_examples)
58
+
59
+ def __call__(
60
+ self,
61
+ examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
62
+ batch = self.base_collator(examples)
63
+
64
+ assert 'attention_mask' in batch
65
+ assert 'input_ids' in batch
66
+
67
+ for key in batch.keys():
68
+ assert key in [
69
+ 'input_ids',
70
+ 'labels',
71
+ 'attention_mask',
72
+ 'bidirectional_mask',
73
+ ]
74
+
75
+ # Cut everything down to size
76
+ sizes, trimmed_examples = [], []
77
+ for idx in range(batch['attention_mask'].shape[0]):
78
+ size, trimmed_example = extract_trim_batch_idx(batch, idx)
79
+ sizes.append(size)
80
+ trimmed_examples.append(trimmed_example)
81
+
82
+ # Apply our CS 101 bin packing algorithm.
83
+ packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = first_fit_bin_packing(
84
+ sizes=sizes,
85
+ examples=trimmed_examples,
86
+ num_bins=self.out_size,
87
+ max_bin_size=self.max_seq_len,
88
+ existing_bins=self._leftover_bins,
89
+ )
90
+ self.n_packed_tokens += n_packed_tokens
91
+ self.n_total_tokens += n_total_tokens
92
+ self.n_packed_examples += self.out_size
93
+ self._leftover_bins = leftover_bins[:self.max_leftover_bins_to_keep]
94
+
95
+ # Re-pad to max_seq_len and batch
96
+ batch = repad(packed_examples,
97
+ max_seq_len=self.max_seq_len,
98
+ pad_token_id=self.pad_token_id,
99
+ padding_side=self.padding_side)
100
+ return batch
101
+
102
+
103
+ def extract_trim_batch_idx(batch: Dict[str, torch.Tensor],
104
+ idx: int) -> Tuple[int, Dict[str, torch.Tensor]]:
105
+ example = {k: v[idx] for k, v in batch.items()}
106
+
107
+ keep = example['attention_mask'] == 1
108
+ size = int(keep.sum())
109
+ trim_example = {k: v[keep] for k, v in example.items()}
110
+ trim_example['sequence_id'] = torch.zeros_like(trim_example['input_ids'])
111
+
112
+ return size, trim_example
113
+
114
+
115
+ def combine_in_place(
116
+ example: Dict[str, torch.Tensor],
117
+ add_on: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
118
+ if 'labels' in add_on:
119
+ # Prevents the last token in example from being trained to
120
+ # predict the first token in add_on, which would make no sense.
121
+ add_on['labels'][0] = -100
122
+
123
+ for k in example.keys():
124
+ if k == 'sequence_id':
125
+ example[k] = torch.cat(
126
+ [example[k], add_on[k] + 1 + torch.max(example[k])])
127
+ else:
128
+ example[k] = torch.cat([example[k], add_on[k]])
129
+ return example
130
+
131
+
132
+ def first_fit_bin_packing(
133
+ sizes: List[int], examples: List[Dict[str, torch.Tensor]], num_bins: int,
134
+ max_bin_size: int, existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]]
135
+ ) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[
136
+ str, torch.Tensor]]]]:
137
+
138
+ # Will contain tuples (bin_size, packed_example)
139
+ bins: List[Tuple[int, Dict[str, torch.Tensor]]] = existing_bins
140
+
141
+ starting_total_bin_sizes = sum([bin_size for bin_size, _ in bins])
142
+
143
+ sizes_and_examples = [
144
+ (size, example) for size, example in zip(sizes, examples)
145
+ ]
146
+ sorted_sizes_and_examples = sorted(sizes_and_examples,
147
+ key=lambda x: x[0],
148
+ reverse=True)
149
+
150
+ required_num_examples = max(0, num_bins - len(bins))
151
+ num_examples = len(sizes)
152
+ if num_examples < required_num_examples:
153
+ for size, example in sorted_sizes_and_examples:
154
+ # Can't keep packing. All remaining items get their own bin.
155
+ bins.append((size, example))
156
+
157
+ total_bin_sizes = sum([bin_size for bin_size, _ in bins])
158
+ total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes
159
+ total_example_sizes = sum(sizes)
160
+ if total_new_bin_sizes != total_example_sizes:
161
+ raise AssertionError(
162
+ f'Error in packing. {total_example_sizes=} does not equal {total_new_bin_sizes=}.'
163
+ )
164
+
165
+ sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True)
166
+ bin_sizes, packed_examples = [], []
167
+ for bin_size, packed_example in sorted_bins:
168
+ bin_sizes.append(bin_size)
169
+ packed_examples.append(packed_example)
170
+
171
+ # Return:
172
+ # - the num_bins largest packed examples
173
+ # - the total tokens in those examples
174
+ # - the total size of all new examples
175
+ # - leftover bins
176
+ return packed_examples[:num_bins], sum(
177
+ bin_sizes[:num_bins]), sum(sizes), sorted_bins[num_bins:]
178
+
179
+ # Go through each item from longest to shortest.
180
+ # Note: all items will either go into an existing or new bin.
181
+ for i, (size, example) in enumerate(sorted_sizes_and_examples):
182
+ # If we can't keep packing, all remaining items get their own bin.
183
+ required_num_examples = max(0, num_bins - len(bins))
184
+ n_remaining = num_examples - i
185
+ assert n_remaining >= required_num_examples
186
+ if n_remaining == required_num_examples:
187
+ # Can't keep packing. All remaining items get their own bin.
188
+ bins.append((size, example))
189
+ continue
190
+
191
+ # Add it to the first bin it fits in
192
+ added = False
193
+ for bidx in range(len(bins)):
194
+ if bins[bidx][0] + size <= max_bin_size:
195
+ bin_size, packed_example = bins.pop(bidx)
196
+ bin_size = bin_size + size
197
+ packed_example = combine_in_place(packed_example, example)
198
+ bins.append((bin_size, packed_example))
199
+ added = True
200
+ break
201
+ # If it didn't fit anywhere, open a new bin
202
+ if not added:
203
+ bins.append((size, example))
204
+
205
+ total_bin_sizes = sum([bin_size for bin_size, _ in bins])
206
+ total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes
207
+ total_example_sizes = sum(sizes)
208
+ if total_new_bin_sizes != total_example_sizes:
209
+ raise AssertionError(
210
+ f'Error in packing. {total_example_sizes=} does not equal {total_new_bin_sizes=}.'
211
+ )
212
+
213
+ sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True)
214
+ bin_sizes, packed_examples = [], []
215
+ for bin_size, packed_example in sorted_bins:
216
+ bin_sizes.append(bin_size)
217
+ packed_examples.append(packed_example)
218
+
219
+ # Return:
220
+ # - the num_bins largest packed examples
221
+ # - the total tokens in those examples
222
+ # - the total size of all new examples
223
+ # - leftover bins
224
+ return packed_examples[:num_bins], sum(
225
+ bin_sizes[:num_bins]), sum(sizes), sorted_bins[num_bins:]
226
+
227
+
228
+ def repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int,
229
+ pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]:
230
+
231
+ def pad_tensor(tensor: torch.Tensor, pad_value: int):
232
+ if len(tensor) == max_seq_len:
233
+ return tensor
234
+ t = torch.full((max_seq_len,),
235
+ pad_value,
236
+ dtype=tensor.dtype,
237
+ device=tensor.device)
238
+ if padding_side == 'left':
239
+ t[-len(tensor):] = tensor
240
+ elif padding_side == 'right':
241
+ t[:len(tensor)] = tensor
242
+ else:
243
+ raise ValueError(f'Unknown {padding_side=}')
244
+ return t
245
+
246
+ pad_vals = {
247
+ 'input_ids': pad_token_id,
248
+ 'labels': -100,
249
+ 'attention_mask': 0,
250
+ 'bidirectional_mask': 0,
251
+ 'sequence_id': -1,
252
+ }
253
+ keys = packed_examples[0].keys()
254
+ batch = {}
255
+ for key in keys:
256
+ batch[key] = torch.stack([
257
+ pad_tensor(example[key], pad_vals[key])
258
+ for example in packed_examples
259
+ ])
260
+ return batch
261
+
262
+
263
+ if __name__ == '__main__':
264
+ from argparse import ArgumentParser, Namespace
265
+
266
+ from omegaconf import OmegaConf as om
267
+
268
+ from llmfoundry import (build_finetuning_dataloader,
269
+ build_text_denoising_dataloader)
270
+ from llmfoundry.data import build_text_dataloader
271
+ from llmfoundry.utils import build_tokenizer
272
+
273
+ def parse_args() -> Namespace:
274
+ """Parse commandline arguments."""
275
+ parser = ArgumentParser(
276
+ description=
277
+ 'Profile packing_ratio choices for a particular workload.')
278
+ parser.add_argument(
279
+ '--yaml-path',
280
+ type=str,
281
+ required=True,
282
+ help='Path to the YAML that defines the workload to profile.')
283
+ parser.add_argument('--num-devices',
284
+ type=int,
285
+ default=None,
286
+ help='How many devices your run will use.')
287
+ parser.add_argument('--min',
288
+ type=float,
289
+ required=True,
290
+ help='Smallest packing_ratio to test. Must be >=1.')
291
+ parser.add_argument(
292
+ '--max',
293
+ type=float,
294
+ required=True,
295
+ help='Largest packing_ratio to test. Must be larger than `min`.')
296
+ parser.add_argument(
297
+ '--num-packing-ratios',
298
+ type=int,
299
+ default=10,
300
+ help=
301
+ 'Number of packing_ratio values (spaced between `min` and `max`) to try.'
302
+ )
303
+
304
+ args = parser.parse_args()
305
+
306
+ if not os.path.isfile(args.yaml_path):
307
+ raise FileNotFoundError(
308
+ '`yaml_path` does not correspond to any existing file.')
309
+ if args.num_devices < 1:
310
+ raise ValueError('`num_devices` must be a positive integer.')
311
+ if args.min < 1.0:
312
+ raise ValueError('`min` must be >=1.0.')
313
+ if args.max < args.min:
314
+ raise ValueError('`max` cannot be less than `min`.')
315
+ if args.num_packing_ratios < 1:
316
+ raise ValueError('`num_packing_ratios` must be a positive integer.')
317
+ return args
318
+
319
+ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
320
+ device_batch_size: int):
321
+ if cfg.name == 'text':
322
+ return build_text_dataloader(cfg, tokenizer, device_batch_size)
323
+ elif cfg.name == 'text_denoising':
324
+ return build_text_denoising_dataloader(cfg, tokenizer,
325
+ device_batch_size)
326
+ elif cfg.name == 'finetuning':
327
+ return build_finetuning_dataloader(cfg, tokenizer,
328
+ device_batch_size)
329
+ else:
330
+ raise ValueError(
331
+ f'Not sure how to build dataloader with config: {cfg}')
332
+
333
+ args = parse_args()
334
+
335
+ with open(args.yaml_path) as f:
336
+ cfg = om.load(f)
337
+ if 'parameters' in cfg:
338
+ cfg = om.to_container(cfg.parameters)
339
+ cfg = om.create(cfg)
340
+ device_batch_size = cfg.global_train_batch_size // args.num_devices
341
+
342
+ # Determine the packing_ratio values we'll try
343
+ packing_ratios, raw_batch_sizes = [], []
344
+ for packing_ratio in np.linspace(args.min,
345
+ args.max,
346
+ args.num_packing_ratios,
347
+ endpoint=True):
348
+ packing_ratio = np.round(10 * packing_ratio) / 10
349
+ raw_batch_size = int(packing_ratio * device_batch_size)
350
+ if raw_batch_size not in raw_batch_sizes:
351
+ packing_ratios.append(packing_ratio)
352
+ raw_batch_sizes.append(raw_batch_size)
353
+
354
+ # Fetch a bunch of raw examples once, which we'll re-use
355
+ if 'train_loader' not in cfg:
356
+ raise ValueError('config must define train_loader')
357
+ dataloader_cfg = cfg.train_loader
358
+
359
+ max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep',
360
+ None)
361
+
362
+ # build tokenizer
363
+ if 'tokenizer' not in cfg:
364
+ raise ValueError('config must define tokenizer')
365
+
366
+ resolved_tokenizer_cfg = om.to_container(cfg.tokenizer, resolve=True)
367
+ if not isinstance(resolved_tokenizer_cfg, Dict):
368
+ raise ValueError(
369
+ 'tokenizer config needs to be resolved by omegaconf into a Dict.')
370
+ tokenizer_cfg: Dict[Any, Any] = resolved_tokenizer_cfg
371
+
372
+ tokenizer_name = tokenizer_cfg['name']
373
+ tokenizer_kwargs = tokenizer_cfg.get('kwargs', {})
374
+ tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
375
+
376
+ # Turn off packing for the dataloader (we want raw, pre-packed examples)
377
+ dataloader_cfg.dataset.packing_ratio = None
378
+ dataloader_cfg.dataset.max_leftovers_to_keep = None
379
+ train_dataloader = build_dataloader(dataloader_cfg, tokenizer,
380
+ max(raw_batch_sizes) * 100)
381
+
382
+ # Get a bunch of raw examples
383
+ big_batch = next(iter(train_dataloader))
384
+
385
+ def split_big_batch(raw_batch_size: int) -> List:
386
+ input_ids = big_batch['input_ids'].split(raw_batch_size)
387
+ batches = [{'input_ids': x} for x in input_ids]
388
+
389
+ for key in big_batch.keys():
390
+ if key == 'input_ids':
391
+ continue
392
+ for idx, split in enumerate(big_batch[key].split(raw_batch_size)):
393
+ batches[idx].update({key: split})
394
+ return batches
395
+
396
+ def profile_packing(raw_batch_size: int) -> Tuple[float, float]:
397
+ packer = BinPackWrapper(
398
+ collator=lambda x: x,
399
+ target_batch_size=device_batch_size,
400
+ max_seq_len=dataloader_cfg.dataset.max_seq_len,
401
+ pad_token_id=0, # <-- Doesn't need to be correct for profiling
402
+ padding_side='left', # <-- Doesn't need to be correct for profiling
403
+ max_leftover_bins_to_keep=max_leftovers_to_keep)
404
+
405
+ # Simulate feeding the packing collator a bunch of data
406
+ for batch in split_big_batch(raw_batch_size):
407
+ if batch['input_ids'].shape[0] < device_batch_size:
408
+ continue
409
+ _ = packer(batch)
410
+
411
+ # Return the padding / waste stats over that bunch of data
412
+ padding_percent = 100 * (1 - packer.efficiency)
413
+ waste_percent = 100 * packer.waste
414
+ return padding_percent, waste_percent
415
+
416
+ header = '\n\n\n packing_ratio | % PADDING | % WASTE'
417
+ fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%'
418
+
419
+ print(header)
420
+ print('-' * len(header))
421
+ for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes):
422
+ padding, waste = profile_packing(raw_batch_size)
423
+ print(fstr.format(packing_ratio, padding, waste))
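As a rough illustration of the first-fit-decreasing strategy that `first_fit_bin_packing` implements above, the toy sketch below packs plain integer lengths into bins with a fixed token budget. It is not part of the repo's API; the sizes and the 1024-token budget are invented for the example.

```python
# Toy sketch of first-fit-decreasing bin packing on plain integer lengths.
# This only illustrates the grouping strategy used by first_fit_bin_packing;
# it is not the repo's API. Sizes and max_bin_size are made-up values.
from typing import List, Tuple


def first_fit_decreasing(sizes: List[int], max_bin_size: int) -> List[List[int]]:
    bins: List[Tuple[int, List[int]]] = []  # (tokens used, item sizes)
    for size in sorted(sizes, reverse=True):
        for i, (used, items) in enumerate(bins):
            if used + size <= max_bin_size:
                bins[i] = (used + size, items + [size])  # fits: add to this bin
                break
        else:
            bins.append((size, [size]))  # didn't fit anywhere: open a new bin
    return [items for _, items in bins]


if __name__ == '__main__':
    # Pack sequences of length 900, 700, 400, 300, 100 under a 1024-token budget.
    print(first_fit_decreasing([900, 700, 400, 300, 100], max_bin_size=1024))
    # -> [[900, 100], [700, 300], [400]]
```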
Perceptrix/finetune/build/lib/llmfoundry/data/text_data.py ADDED
@@ -0,0 +1,367 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Build a StreamingTextDataset dataset and dataloader for training."""
5
+
6
+ import os
7
+ from itertools import islice
8
+ from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
9
+ Union, cast)
10
+
11
+ import numpy as np
12
+ import torch
13
+ import transformers
14
+ from omegaconf import DictConfig
15
+ from omegaconf import OmegaConf as om
16
+ from streaming import Stream, StreamingDataset
17
+ from torch.utils.data import DataLoader
18
+ from transformers import PreTrainedTokenizerBase
19
+
20
+
21
+ class StreamingTextDataset(StreamingDataset):
22
+ """Generic text dataset using MosaicML's StreamingDataset.
23
+
24
+ Args:
25
+ tokenizer (Tokenizer): HuggingFace tokenizer to
26
+ tokenize samples.
27
+ max_seq_len (int): The max sequence length of each sample.
28
+ streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from,
29
+ which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or
30
+ ``remote``/``local``. Defaults to ``None``.
31
+ remote (str, optional): Remote path or directory to download the dataset from. If ``None``,
32
+ its data must exist locally. StreamingDataset uses either ``streams`` or
33
+ ``remote``/``local``. Defaults to ``None``.
34
+ local (str, optional): Local working directory to download shards to. This is where shards
35
+ are cached while they are being used. Uses a temp directory if not set.
36
+ StreamingDataset uses either ``streams`` or ``remote``/``local``. Defaults to ``None``.
37
+ split (str, optional): Which dataset split to use, if any. If provided, we stream from/to
38
+ the ``split`` subdirs of ``remote`` and ``local``. Defaults to ``None``.
39
+ download_retry (int): Number of download re-attempts before giving up. Defaults to ``2``.
40
+ download_timeout (float): Number of seconds to wait for a shard to download before raising
41
+ an exception. Defaults to ``60``.
42
+ validate_hash (str, optional): Optional hash or checksum algorithm to use to validate
43
+ shards. Defaults to ``None``.
44
+ keep_zip (bool): Whether to keep or delete the compressed form when decompressing
45
+ downloaded shards. If ``False``, keep iff remote is local or no remote. Defaults to
46
+ ``False``.
47
+ epoch_size (int, optional): Number of samples to draw per epoch balanced across all
48
+ streams. If ``None``, takes its value from the total number of underlying samples.
49
+ Provide this field if you are weighting streams relatively to target a larger or
50
+ smaller epoch size. Defaults to ``None``.
51
+ predownload (int, optional): Target number of samples ahead to download the shards of while
52
+ iterating. Defaults to ``100_000``.
53
+ cache_limit (Union[int, str], optional): Maximum size in bytes of this StreamingDataset's
54
+ shard cache. Before downloading a shard, the least recently used resident shard(s) may
55
+ be evicted (deleted from the local cache) in order to stay under the limit. Set to None
56
+ to disable shard eviction. Supports integer bytes as well as string human-readable
57
+ bytes (e.g., 100b, 64kb, 77mb, and so on). Defaults to None.
58
+ partition_algo (str): Which partitioning algorithm to use. Defaults to ``orig``.
59
+ num_canonical_nodes (int, optional): Canonical number of nodes for shuffling with
60
+ resumption. Defaults to ``None``, which is interpreted as the number of nodes of the
61
+ initial run.
62
+ batch_size (int, optional): Batch size of its DataLoader, which affects how the dataset is
63
+ partitioned over the workers. Defaults to ``None``.
64
+ shuffle (bool): Whether to iterate over the samples in randomized order. Defaults to
65
+ ``False``.
66
+ shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1b``.
67
+ shuffle_seed (int): Seed for deterministic data shuffling. Defaults to ``9176``.
68
+ shuffle_block_size (int): Unit of shuffle. Defaults to ``1 << 18``.
69
+ sampling_method (str): Which sampling method to use, either ``balanced`` or ``fixed``.
70
+ Defaults to ``balanced``.
71
+ sampling_granularity (int): When picking samples for a stream's final partial repeat,
72
+ how many samples to pick from the same shard at a time (``1`` for evenly balanced
73
+ across shards, ``1000`` to pick 1000 samples from the same shard at a time, etc).
74
+ Defaults to ``1``.
75
+ batching_method (str): Which batching method to use, either ``random``, ``stratified``, or
76
+ ``per_stream``. Defaults to ``random``.
77
+ """
78
+
79
+ def __init__(self,
80
+ tokenizer: PreTrainedTokenizerBase,
81
+ max_seq_len: int,
82
+ streams: Optional[Sequence[Stream]] = None,
83
+ remote: Optional[str] = None,
84
+ local: Optional[str] = None,
85
+ split: Optional[str] = None,
86
+ download_retry: int = 2,
87
+ download_timeout: float = 60,
88
+ validate_hash: Optional[str] = None,
89
+ keep_zip: bool = False,
90
+ epoch_size: Optional[int] = None,
91
+ predownload: int = 100_000,
92
+ cache_limit: Optional[Union[int, str]] = None,
93
+ partition_algo: str = 'orig',
94
+ num_canonical_nodes: Optional[int] = None,
95
+ batch_size: Optional[int] = None,
96
+ shuffle: bool = False,
97
+ shuffle_algo: str = 'py1b',
98
+ shuffle_seed: int = 9176,
99
+ shuffle_block_size: int = 1 << 18,
100
+ sampling_method: str = 'balanced',
101
+ sampling_granularity: int = 1,
102
+ batching_method: str = 'random',
103
+ **kwargs: Any):
104
+
105
+ group_method = kwargs.pop('group_method', None)
106
+ if group_method is not None:
107
+ raise NotImplementedError(
108
+ 'group_method is deprecated and has been removed.\nTo ' +
109
+ 'concatenate, use the --concat_tokens ' +
110
+ 'argument when creating your MDS dataset with concat_c4.py')
111
+
112
+ if len(kwargs) > 0:
113
+ raise ValueError(
114
+ f'StreamingTextDataset() got an unexpected keyword argument: {kwargs}'
115
+ )
116
+
117
+ if local is not None and (remote is None or (local == remote)):
118
+ if os.path.isdir(local):
119
+ contents = set(os.listdir(local))
120
+ if split not in contents:
121
+ raise ValueError(
122
+ f'local directory {local} does not contain split {split}'
123
+ )
124
+
125
+ # TODO: discover where yamls are being converted incorrectly; this is a temporary workaround
126
+ if isinstance(shuffle_block_size, float):
127
+ shuffle_block_size = int(shuffle_block_size)
128
+
129
+ # Build Dataset
130
+ super().__init__(
131
+ streams=streams,
132
+ remote=remote,
133
+ local=local,
134
+ split=split,
135
+ download_retry=download_retry,
136
+ download_timeout=download_timeout,
137
+ validate_hash=validate_hash,
138
+ keep_zip=keep_zip,
139
+ epoch_size=epoch_size,
140
+ predownload=predownload,
141
+ cache_limit=cache_limit,
142
+ partition_algo=partition_algo,
143
+ num_canonical_nodes=num_canonical_nodes,
144
+ batch_size=batch_size,
145
+ shuffle=shuffle,
146
+ shuffle_algo=shuffle_algo,
147
+ shuffle_seed=shuffle_seed,
148
+ shuffle_block_size=shuffle_block_size,
149
+ sampling_method=sampling_method,
150
+ sampling_granularity=sampling_granularity,
151
+ batching_method=batching_method,
152
+ )
153
+ self.tokenizer = tokenizer
154
+ self.max_seq_len = max_seq_len
155
+
156
+ # How to tokenize a text sample to a token sample
157
+ def _tokenize(self, text_sample: Mapping) -> Dict[str, List[int]]:
158
+ if self.tokenizer._pad_token is None:
159
+ # Some tokenizers (e.g. GPT2 tokenizer) have no padding token which causes bugs
160
+ raise RuntimeError(
161
+ 'If tokenizing on-the-fly, tokenizer must have a pad_token_id')
162
+
163
+ return self.tokenizer(text_sample['text'],
164
+ truncation=True,
165
+ padding='max_length',
166
+ max_length=self.max_seq_len)
167
+
168
+ def _read_binary_tokenized_sample(self, sample: Dict[str,
169
+ Any]) -> torch.Tensor:
170
+ return torch.from_numpy(
171
+ np.frombuffer(sample['tokens'],
172
+ dtype=np.int64)[:self.max_seq_len].copy())
173
+
174
+ # How to process a sample
175
+ def __getitem__(self,
176
+ idx: int) -> Union[Dict[str, List[int]], torch.Tensor]:
177
+ sample = super().__getitem__(idx)
178
+ if 'text' in sample:
179
+ token_sample = self._tokenize(sample)
180
+ elif 'tokens' in sample:
181
+ token_sample = self._read_binary_tokenized_sample(sample)
182
+ else:
183
+ raise RuntimeError(
184
+ 'StreamingTextDataset needs samples to have a `text` or `tokens` column'
185
+ )
186
+ return token_sample
187
+
188
+
189
+ class ConcatenatedSequenceCollatorWrapper:
190
+ """Collator wrapper to add sequence_id to batch."""
191
+
192
+ def __init__(
193
+ self,
194
+ base_collator: Callable,
195
+ eos_token_id: Optional[int] = None,
196
+ bos_token_id: Optional[int] = None,
197
+ ):
198
+ self.base_collator = base_collator
199
+ if (eos_token_id is None) and (bos_token_id is None):
200
+ raise ValueError(
201
+ 'Must supply a value for either eos_token_id or bos_token_id, but got None for both.'
202
+ )
203
+ if (eos_token_id is not None) and (bos_token_id is not None):
204
+ raise ValueError(
205
+ 'Cannot use *both* EOS and BOS tokens for detecting sequence boundaries. ' +\
206
+ 'Please supply `eos_token_id` if sequences end with an EOS token, or use ' +\
207
+ '`bos_token_id` if sequences start with a BOS token.'
208
+ )
209
+
210
+ if eos_token_id is None:
211
+ self.split_token_id = cast(int, bos_token_id)
212
+ self.bos_mode = True
213
+ else:
214
+ self.split_token_id = eos_token_id
215
+ self.bos_mode = False
216
+
217
+ def __call__(self, examples: List[Any]) -> Dict[str, torch.Tensor]:
218
+ batch = self.base_collator(examples)
219
+ batch['sequence_id'] = self.get_sequence_id_from_batch(batch)
220
+ return batch
221
+
222
+ def get_sequence_id_from_batch(
223
+ self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
224
+ is_separator = torch.eq(batch['input_ids'], self.split_token_id)
225
+ cumulative_sep = torch.cumsum(is_separator,
226
+ dim=1).to(batch['input_ids'].dtype)
227
+ # If separator token is bos, we're already done
228
+ if self.bos_mode:
229
+ return cumulative_sep
230
+
231
+ # If separator token is eos, right shift 1 space
232
+ left_zeros = cumulative_sep.new_zeros((cumulative_sep.shape[0], 1))
233
+ return torch.cat([left_zeros, cumulative_sep[:, :-1]], dim=1)
234
+
235
+
236
+ def build_text_dataloader(
237
+ cfg: DictConfig,
238
+ tokenizer: PreTrainedTokenizerBase,
239
+ device_batch_size: int,
240
+ ) -> DataLoader:
241
+ assert cfg.name == 'text', f'Tried to build text dataloader with cfg.name={cfg.name}'
242
+ if cfg.dataset.get('group_method', None) is not None:
243
+ raise NotImplementedError(
244
+ 'group_method is deprecated and has been removed.\nTo ' +
245
+ 'concatenate, use the --concat_tokens ' +
246
+ 'argument when creating your MDS dataset with convert_dataset_hf.py'
247
+ )
248
+
249
+ # get kwargs
250
+ streams_dict = cfg.dataset.pop('streams', None)
251
+ mlm_probability = cfg.dataset.pop('mlm_probability', None)
252
+ eos_token_id = cfg.dataset.pop('eos_token_id', None)
253
+ bos_token_id = cfg.dataset.pop('bos_token_id', None)
254
+
255
+ # build streams
256
+ streams = None
257
+ if streams_dict is not None:
258
+ streams = []
259
+ for _, stream in streams_dict.items():
260
+ # stream is the streams kwargs
261
+ # fwd all kwargs with **stream allows streaming to check args
262
+ streams.append(Stream(**stream))
263
+
264
+ # build dataset potentially with streams
265
+ dataset = StreamingTextDataset(
266
+ tokenizer=tokenizer,
267
+ streams=streams,
268
+ batch_size=device_batch_size,
269
+ **cfg.dataset,
270
+ )
271
+
272
+ collate_fn = transformers.DataCollatorForLanguageModeling(
273
+ tokenizer=dataset.tokenizer,
274
+ mlm=mlm_probability is not None,
275
+ mlm_probability=mlm_probability)
276
+
277
+ if (eos_token_id is not None) or (bos_token_id is not None):
278
+ # Note: Will raise an error if both are non-None
279
+ collate_fn = ConcatenatedSequenceCollatorWrapper(
280
+ base_collator=collate_fn,
281
+ eos_token_id=eos_token_id,
282
+ bos_token_id=bos_token_id)
283
+
284
+ return DataLoader(
285
+ dataset,
286
+ collate_fn=collate_fn,
287
+ batch_size=device_batch_size,
288
+ drop_last=cfg.drop_last,
289
+ num_workers=cfg.num_workers,
290
+ pin_memory=cfg.get('pin_memory', True),
291
+ prefetch_factor=cfg.get('prefetch_factor', 2),
292
+ persistent_workers=cfg.get('persistent_workers', True),
293
+ timeout=cfg.get('timeout', 0),
294
+ )
295
+
296
+
297
+ # Helpful to test if your dataloader is working locally
298
+ # Run `python text_data.py --local_path [local] [--remote_path remote, optional]` and verify that batches are printed out
299
+ if __name__ == '__main__':
300
+ import argparse
301
+
302
+ from llmfoundry.utils.builders import build_tokenizer
303
+
304
+ parser = argparse.ArgumentParser()
305
+ parser.add_argument('--tokenizer',
306
+ type=str,
307
+ default='EleutherAI/gpt-neox-20b',
308
+ help='the name of the tokenizer to use')
309
+ parser.add_argument('--local_path',
310
+ type=str,
311
+ required=True,
312
+ help='the path to the local copy of the dataset')
313
+ parser.add_argument(
314
+ '--remote_path',
315
+ type=str,
316
+ default=None,
317
+ help='the path to the remote copy to stream from (optional)')
318
+ parser.add_argument('--split',
319
+ type=str,
320
+ default='val',
321
+ help='which split of the dataset to use')
322
+ parser.add_argument('--max_seq_len',
323
+ type=int,
324
+ default=32,
325
+ help='max sequence length to test')
326
+
327
+ args = parser.parse_args()
328
+
329
+ if args.remote_path is not None:
330
+ print(
331
+ f'Reading {args.split} split from {args.local_path} <- streamed from <- {args.remote_path}'
332
+ )
333
+ else:
334
+ print(f'Reading {args.split} split from {args.local_path}')
335
+
336
+ cfg = {
337
+ 'name': 'text',
338
+ 'dataset': {
339
+ 'local': args.local_path,
340
+ 'remote': args.remote_path,
341
+ 'split': args.split,
342
+ 'shuffle': False,
343
+ 'max_seq_len': args.max_seq_len,
344
+ 'keep_zip': True, # in case we need compressed files after testing
345
+ },
346
+ 'drop_last': False,
347
+ 'num_workers': 4,
348
+ }
349
+ cfg = om.create(cfg)
350
+ device_batch_size = 2
351
+
352
+ tokenizer_name = args.tokenizer
353
+ tokenizer_kwargs = {'model_max_length': args.max_seq_len}
354
+ tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
355
+
356
+ loader = build_text_dataloader(cfg, tokenizer, device_batch_size)
357
+ assert isinstance(loader.dataset, StreamingTextDataset)
358
+ tokenizer = loader.dataset.tokenizer
359
+
360
+ for batch_ix, batch in enumerate(islice(loader, 5)):
361
+ print('\n')
362
+ print('#' * 20, f'Batch {batch_ix}', '#' * 20)
363
+ for k, v in batch.items():
364
+ print(k, v.shape, v.dtype)
365
+ for sample_ix, token_sample in enumerate(batch['input_ids']):
366
+ print('-' * 20, f' Sample {sample_ix} ', '-' * 20)
367
+ print(tokenizer.decode(token_sample))
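For readers tracing `ConcatenatedSequenceCollatorWrapper.get_sequence_id_from_batch` above, here is a minimal, self-contained sketch of its cumulative-sum / right-shift logic on a made-up batch; the token ids and the EOS id of 2 are illustrative assumptions only.

```python
# Minimal sketch of deriving sequence_id from EOS positions, mirroring
# get_sequence_id_from_batch above. Token ids and eos_token_id=2 are made up.
import torch

input_ids = torch.tensor([[5, 6, 2, 7, 8, 2, 9]])  # EOS tokens at positions 2 and 5
is_separator = torch.eq(input_ids, 2)
cumulative_sep = torch.cumsum(is_separator, dim=1).to(input_ids.dtype)
# EOS terminates a sequence, so shift right by one position so that the EOS
# token keeps the id of the sequence it ends.
left_zeros = cumulative_sep.new_zeros((cumulative_sep.shape[0], 1))
sequence_id = torch.cat([left_zeros, cumulative_sep[:, :-1]], dim=1)
print(sequence_id)  # tensor([[0, 0, 0, 1, 1, 1, 2]])
```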
Perceptrix/finetune/build/lib/llmfoundry/models/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM,
5
+ ComposerHFT5)
6
+ from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig,
7
+ MPTForCausalLM, MPTModel, MPTPreTrainedModel)
8
+
9
+ __all__ = [
10
+ 'ComposerHFCausalLM',
11
+ 'ComposerHFPrefixLM',
12
+ 'ComposerHFT5',
13
+ 'MPTConfig',
14
+ 'MPTPreTrainedModel',
15
+ 'MPTModel',
16
+ 'MPTForCausalLM',
17
+ 'ComposerMPTCausalLM',
18
+ ]
Perceptrix/finetune/build/lib/llmfoundry/models/hf/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
5
+ from llmfoundry.models.hf.hf_fsdp import (prepare_hf_causal_lm_model_for_fsdp,
6
+ prepare_hf_enc_dec_model_for_fsdp,
7
+ prepare_hf_model_for_fsdp)
8
+ from llmfoundry.models.hf.hf_prefix_lm import ComposerHFPrefixLM
9
+ from llmfoundry.models.hf.hf_t5 import ComposerHFT5
10
+
11
+ __all__ = [
12
+ 'ComposerHFCausalLM',
13
+ 'ComposerHFPrefixLM',
14
+ 'ComposerHFT5',
15
+ 'prepare_hf_causal_lm_model_for_fsdp',
16
+ 'prepare_hf_enc_dec_model_for_fsdp',
17
+ 'prepare_hf_model_for_fsdp',
18
+ ]
Perceptrix/finetune/build/lib/llmfoundry/models/hf/hf_causal_lm.py ADDED
@@ -0,0 +1,227 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Implements a Hugging Causal LM wrapped inside a :class:`.ComposerModel`."""
5
+
6
+ import logging
7
+ import os
8
+ from typing import Mapping, Union
9
+
10
+ # required for loading a python model into composer
11
+ import transformers
12
+ from composer.metrics.nlp import (InContextLearningCodeEvalAccuracy,
13
+ InContextLearningLMAccuracy,
14
+ InContextLearningLMExpectedCalibrationError,
15
+ InContextLearningMCExpectedCalibrationError,
16
+ InContextLearningMultipleChoiceAccuracy,
17
+ InContextLearningQAAccuracy,
18
+ LanguageCrossEntropy, LanguagePerplexity)
19
+ from composer.utils import dist
20
+ from omegaconf import DictConfig
21
+ from torch import nn
22
+ from transformers import (AutoConfig, AutoModelForCausalLM,
23
+ PreTrainedTokenizerBase)
24
+
25
+ from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
26
+ from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
27
+ from llmfoundry.models.layers.llama_attention_monkeypatch import \
28
+ get_llama_attention_patch_fn
29
+ from llmfoundry.models.utils import init_empty_weights
30
+
31
+ try:
32
+ from peft.peft_model import PeftModel
33
+ model_types = PeftModel, transformers.PreTrainedModel
34
+
35
+ except ImportError:
36
+ model_types = transformers.PreTrainedModel
37
+
38
+ __all__ = ['ComposerHFCausalLM']
39
+
40
+ log = logging.getLogger(__name__)
41
+
42
+
43
+ class ComposerHFCausalLM(HuggingFaceModelWithZLoss):
44
+ """Configures a :class:`.HuggingFaceModel` around a Causal LM.
45
+
46
+ Args:
47
+ om_model_config (DictConfig | PeftModel | transformers.PreTrainedModel): either an omegaconf dictionary used to configure the model, or an instantiated model object from the peft or transformers library.
48
+ if DictConfig, the following keys are required:
49
+ cfg.pretrained_model_name_or_path (str): The name of or local path to
50
+ the HF Causal LM (e.g., `gpt2` to instantiate a GPT2LMHeadModel).
51
+ cfg.config_overrides (dict, optional): An optional dictionary of keyword
52
+ arguments that override the default configuration associated with
53
+ cfg.pretrained_model_name_or_path.
54
+ cfg.pretrained (bool): Whether to instantiate the model with pre-trained
55
+ weights coming from cfg.pretrained_model_name_or_path. If ``True``,
56
+ cfg.config_overrides must be compatible with the pre-trained weights.
57
+ cfg.init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to
58
+ initialize the model on. Currently, `meta` is only supported when
59
+ cfg.pretrained is ``False``. Default: ``'cpu'``.
60
+ tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
61
+ """
62
+
63
+ def __init__(self, om_model_config: Union[DictConfig,
64
+ transformers.PreTrainedModel,
65
+ nn.Module],
66
+ tokenizer: PreTrainedTokenizerBase):
67
+ # set up training and eval metrics
68
+ train_metrics = [LanguageCrossEntropy(), LanguagePerplexity()]
69
+ eval_metrics = [
70
+ LanguageCrossEntropy(),
71
+ LanguagePerplexity(),
72
+ InContextLearningLMAccuracy(),
73
+ InContextLearningMultipleChoiceAccuracy(),
74
+ InContextLearningQAAccuracy(),
75
+ InContextLearningCodeEvalAccuracy(),
76
+ InContextLearningLMExpectedCalibrationError(),
77
+ InContextLearningMCExpectedCalibrationError()
78
+ ]
79
+
80
+ # if we are passed a DictConfig, we need to instantiate the model
81
+ if isinstance(om_model_config, DictConfig):
82
+ if not om_model_config.get('trust_remote_code',
83
+ True) and om_model_config.get(
84
+ 'pretrained_model_name_or_path',
85
+ None).startswith('mosaicml/mpt'):
86
+ raise ValueError(
87
+ 'trust_remote_code must be set to True for MPT models. Without this, the MPT model code will come from the transformers library, '
88
+ +
89
+ 'which is not significantly slower and not compatible with the LLM foundry training code, rather than the code release by MosaicML.'
90
+ )
91
+
92
+ if not om_model_config.get('use_train_metrics', True):
93
+ train_metrics = []
94
+
95
+ # load the model config
96
+ trust_remote_code = om_model_config.get('trust_remote_code', True)
97
+ use_auth_token = om_model_config.get('use_auth_token', False)
98
+ config = AutoConfig.from_pretrained(
99
+ om_model_config.pretrained_model_name_or_path,
100
+ trust_remote_code=trust_remote_code,
101
+ use_auth_token=use_auth_token,
102
+ )
103
+
104
+ # set config overrides
105
+ for k, v in om_model_config.get('config_overrides', {}).items():
106
+ if not hasattr(config, k):
107
+ raise ValueError(
108
+ f'config does not have attribute "{k}" to override ({k}: {v}).'
109
+ )
110
+
111
+ attr = getattr(config, k)
112
+ # attempt to disallow typos in nested configs
113
+ if isinstance(attr, Mapping):
114
+ extra_keys = [
115
+ _k for _k in v.keys() if _k not in attr.keys()
116
+ ]
117
+ if extra_keys:
118
+ raise ValueError(
119
+ f'Config dict override got unknown keys. ' +
120
+ f'Extra keys: {extra_keys}. ' +
121
+ f'Expected (a subset of) keys: {list(attr.keys())}.'
122
+ )
123
+ getattr(config, k).update(v)
124
+ # necessary case to allow for rope_scaling to be overridden in llama config
125
+ elif attr is None and isinstance(v, Mapping):
126
+ setattr(config, k, {})
127
+ getattr(config, k).update(v)
128
+ else:
129
+ setattr(config, k, v)
130
+
131
+ load_in_8bit = om_model_config.get('load_in_8bit', False)
132
+
133
+ # below we set up the device to initialize the model on
134
+ init_device = om_model_config.get('init_device', 'cpu')
135
+
136
+ # Get the device we want to initialize, and use the
137
+ # resolved version to initialize the HF model
138
+ resolved_init_device = hf_get_init_device(init_device)
139
+
140
+ # We need to have all non-zero local ranks be not-pretrained
141
+ # Rank 0 will still be pretrained, and distribute the weights appropriately
142
+ if dist.get_local_rank() != 0 and init_device == 'mixed':
143
+ om_model_config.pretrained = False
144
+
145
+ # initialize the model on the correct device
146
+ if resolved_init_device == 'cpu':
147
+ if om_model_config.pretrained:
148
+ model = AutoModelForCausalLM.from_pretrained(
149
+ om_model_config.pretrained_model_name_or_path,
150
+ trust_remote_code=trust_remote_code,
151
+ use_auth_token=use_auth_token,
152
+ load_in_8bit=load_in_8bit,
153
+ config=config)
154
+ else:
155
+ model = AutoModelForCausalLM.from_config(
156
+ config,
157
+ trust_remote_code=trust_remote_code,
158
+ )
159
+ elif resolved_init_device == 'meta':
160
+ if om_model_config.pretrained:
161
+ raise ValueError(
162
+ 'Setting cfg.pretrained=True is not supported when init_device="meta".'
163
+ )
164
+ with init_empty_weights(include_buffers=False):
165
+ model = AutoModelForCausalLM.from_config(
166
+ config,
167
+ trust_remote_code=trust_remote_code,
168
+ )
169
+ else:
170
+ raise ValueError(
171
+ f'init_device="{init_device}" must be either "cpu" or "meta".'
172
+ )
173
+
174
+ signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed'
175
+ if dist.get_local_rank() == 0:
176
+ with open(signal_file_path, 'wb') as f:
177
+ f.write(b'local_rank0_completed_download')
178
+
179
+ # Avoid the collective call until the local rank zero has finished trying to download the checkpoint
180
+ # so that we don't timeout for large downloads. This syncs all processes on the node
181
+ with dist.local_rank_zero_download_and_wait(signal_file_path):
182
+ # Then, wait to ensure every node has finished downloading the checkpoint
183
+ dist.barrier()
184
+
185
+ if dist.get_local_rank() == 0:
186
+ os.remove(signal_file_path)
187
+
188
+ z_loss = om_model_config.get('z_loss', 0.0)
189
+
190
+ attention_patch_type = om_model_config.get('attention_patch_type',
191
+ None)
192
+ if attention_patch_type is not None:
193
+ if model.config.model_type != 'llama':
194
+ raise ValueError(
195
+ f'attention_patch_type is only supported for llama models, but got {model.config.model_type}'
196
+ )
197
+
198
+ log.debug(
199
+ f'Patching llama attention with {attention_patch_type} attention'
200
+ )
201
+ from transformers.models.llama.modeling_llama import \
202
+ LlamaAttention
203
+ LlamaAttention.forward = get_llama_attention_patch_fn(
204
+ attention_patch_type)
205
+ model.config.use_cache = False
206
+
207
+ # elif the model is either a PeftModel or a PreTrainedModel
208
+ elif isinstance(om_model_config, model_types):
209
+ model = om_model_config
210
+ init_device = 'cpu'
211
+ z_loss = 0.0
212
+
213
+ # else, unsupported type
214
+ else:
215
+ raise ValueError(
216
+ f'om_model_config must be either a DictConfig, PeftModel, or PreTrainedModel, but got {type(om_model_config)}'
217
+ )
218
+
219
+ composer_model = super().__init__(model=model,
220
+ shift_labels=True,
221
+ tokenizer=tokenizer,
222
+ metrics=train_metrics,
223
+ eval_metrics=eval_metrics,
224
+ z_loss=z_loss,
225
+ init_device=init_device)
226
+
227
+ return composer_model
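Below is a minimal sketch of constructing `ComposerHFCausalLM` from an omegaconf `DictConfig`, based only on the keys documented in the class docstring above. The `gpt2` model name and the tokenizer choice are illustrative assumptions, not requirements of the repo, and running it end-to-end assumes composer's single-process defaults.

```python
# Minimal usage sketch (assumptions: gpt2 as the base model, single process).
from omegaconf import OmegaConf
from transformers import AutoTokenizer

from llmfoundry.models.hf import ComposerHFCausalLM

model_cfg = OmegaConf.create({
    'pretrained_model_name_or_path': 'gpt2',  # hypothetical choice for the example
    'pretrained': True,
    'init_device': 'cpu',
    'config_overrides': {},
})
tokenizer = AutoTokenizer.from_pretrained('gpt2')
composer_model = ComposerHFCausalLM(model_cfg, tokenizer)
print(type(composer_model.model))  # the wrapped HF causal LM
```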
Perceptrix/finetune/build/lib/llmfoundry/models/hf/hf_fsdp.py ADDED
@@ -0,0 +1,257 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # helper functions from https://github.com/CarperAI/trlx/blob/main/trlx/utils/modeling.py
5
+ # which is MIT licensed
6
+
7
+ import functools
8
+ from typing import Any, Iterable, List, Optional
9
+
10
+ import torch
11
+ from transformers import PreTrainedModel
12
+ from transformers.models.opt.modeling_opt import OPTDecoder
13
+
14
+
15
+ # helper functions
16
+ def rhasattr(obj: Any, attr: str) -> bool:
17
+ """A chain-able attribute version of hasattr.
18
+
19
+ For example, to check if
20
+ `obj` has the attribute `foo.bar.baz`, you can use:
21
+ `rhasattr(obj, "foo.bar.baz")`
22
+ Reference: https://stackoverflow.com/a/67303315
23
+ """
24
+ _nested_attrs = attr.split('.')
25
+ _curr_obj = obj
26
+ for _a in _nested_attrs[:-1]:
27
+ if hasattr(_curr_obj, _a):
28
+ _curr_obj = getattr(_curr_obj, _a)
29
+ else:
30
+ return False
31
+ return hasattr(_curr_obj, _nested_attrs[-1])
32
+
33
+
34
+ def rgetattr(obj: Any, attr: str, *args: List[Any]) -> Any:
35
+ """A chain-able attribute version of getattr.
36
+
37
+ For example, to get the attribute `foo.bar.baz` from `obj`, you can use:
38
+ `rgetattr(obj, "foo.bar.baz")`
39
+ Reference: https://stackoverflow.com/a/31174427
40
+ """
41
+
42
+ def _getattr(obj: Any, attr: str):
43
+ return getattr(obj, attr, *args)
44
+
45
+ return functools.reduce(_getattr, [obj] + attr.split('.'))
46
+
47
+
48
+ def findattr(obj: Any, attrs: Iterable[str]) -> Optional[Any]:
49
+ for attr in attrs:
50
+ if rhasattr(obj, attr):
51
+ return rgetattr(obj, attr)
52
+ return None
53
+
54
+
55
+ def hf_get_causal_base_model(model: PreTrainedModel) -> Any:
56
+ """Returns the causal decoder backbone of the specified HuggingFace model.
57
+
58
+ Newer HF models have a `self.get_decoder()` method. Older models do not.
59
+
60
+ NOTE: Different model configurations have different causal decoder attribute
61
+ names.
62
+ - transformer: (GPT2LMHeadModel, GPTJConfig)
63
+ - model.decoder: (OPTConfig, BloomConfig)
64
+ - gpt_neox: (GPTNeoXConfig)
65
+ """
66
+ if hasattr(model, 'get_decoder'):
67
+ return model.get_decoder()
68
+
69
+ decoder_attrs = ('transformer', 'model.decoder', 'gpt_neox')
70
+ causal_base_model = findattr(model, decoder_attrs)
71
+ if causal_base_model is None:
72
+ raise ValueError(
73
+ f'Unable to FSDP-wrap model {model}. Please open a github issue to add support.'
74
+ )
75
+ return causal_base_model
76
+
77
+
78
+ def hf_get_hidden_layers(model: PreTrainedModel) -> Any:
79
+ """Returns the hidden layers of the specified model.
80
+
81
+ NOTE: Different model configurations have different hidden layer attribute names.
82
+ - transformer.h: (BloomForCausalLM, GPT2LMHeadModel, GPTJForCausalLM)
83
+ - model.decoder.layers: (OPTForCausalLM)
84
+ - gpt_neox.layers: (GPTNeoXForCausalLM)
85
+ - model.layers: (LlamaForCausalLM)
86
+ - transformer.blocks: (MPTForCausalLM)
87
+ """
88
+ hidden_layers_attrs = (
89
+ 'transformer.h', # BLOOM, GPT2, GPTJ
90
+ 'model.decoder.layers', # OPT
91
+ 'gpt_neox.layers', # GPTNeoX
92
+ 'block', # T5, BART, Pegasus (from encoder)
93
+ 'layers', # ProphetNet, Marian (from encoder)
94
+ 'model.layers', # LLaMa
95
+ 'transformer.blocks', # MPT
96
+ )
97
+ layers = findattr(model, hidden_layers_attrs)
98
+ if layers is None:
99
+ raise ValueError(
100
+ f'Unable to find hidden layer for {model}. Model must have one of the following attributes: {hidden_layers_attrs}'
101
+ )
102
+ return layers
103
+
104
+
105
+ def hf_get_init_device(init_device: Optional[str]) -> Optional[str]:
106
+ """Returns the appropriate device to initialize models."""
107
+ from composer.utils import dist
108
+ if init_device == 'mixed':
109
+ if dist.get_local_rank() == 0:
110
+ return 'cpu'
111
+ return 'meta'
112
+ return init_device
113
+
114
+
115
+ # /end helper functions
116
+
117
+
118
+ def prepare_hf_model_for_fsdp(model: PreTrainedModel,
119
+ init_device: Optional[str]) -> None:
120
+ """FSDP wrap a HuggingFace model.
121
+
122
+ Dispatches to the wrapping function appropriate for the model architecture (encoder/decoder vs. decoder-only).
123
+ """
124
+ if model.config.is_encoder_decoder:
125
+ prepare_hf_enc_dec_model_for_fsdp(model, init_device)
126
+ else:
127
+ # many common decoder-only models do not set the flag
128
+ # model.config.is_decoder, so we can't trust it
129
+ prepare_hf_causal_lm_model_for_fsdp(model, init_device)
130
+
131
+
132
+ def prepare_hf_causal_lm_model_for_fsdp(model: PreTrainedModel,
133
+ init_device: Optional[str]) -> None:
134
+ """FSDP wrap a HuggingFace decoder.
135
+
136
+ Wrap any model for FSDP which follows one of the 3 existing conventions from
137
+ HuggingFace for decoder-only LLMs.
138
+ """
139
+ causal_base_model = hf_get_causal_base_model(model)
140
+
141
+ # OPT has an extra layer of wrapping, so special case here
142
+ if isinstance(causal_base_model, OPTDecoder):
143
+ model.model._fsdp_wrap = False
144
+ model_block = hf_get_hidden_layers(model)
145
+ lm_head = model.get_output_embeddings()
146
+ # some models (OPT) implement .get_input_embeddings for the causal subclass
147
+ # but all of them implement it for the base model
148
+ tied_embeddings = causal_base_model.get_input_embeddings()
149
+ modules = {
150
+ 'base_model': causal_base_model,
151
+ 'model_block': model_block,
152
+ 'lm_head': lm_head,
153
+ 'tied_embeddings': tied_embeddings
154
+ }
155
+
156
+ for mod_name, module in modules.items():
157
+ if module is None:
158
+ raise ValueError(
159
+ f'Unable to FSDP-wrap this model! `{mod_name}` does not ' +
160
+ 'follow common layer/weight naming conventions.')
161
+ block_type = type(model_block[0])
162
+ if init_device == 'mixed':
163
+ # For FSDP with the `mixed` device initialization scheme, which
164
+ # initializes the model on rank 0 on `cpu` and on all other ranks on `meta`,
165
+ # we need to tag all child modules that are torch.nn.Modules with `_fsdp_wrap`.
166
+ for child in model.children():
167
+ if isinstance(child, type(causal_base_model)):
168
+ continue
169
+ if isinstance(child, torch.nn.Module):
170
+ child._fsdp_wrap = True
171
+
172
+ for child in causal_base_model.children():
173
+ if isinstance(child, torch.nn.ModuleList):
174
+ continue
175
+ if isinstance(child, torch.nn.Module):
176
+ child._fsdp_wrap = True
177
+
178
+ if model.config.tie_word_embeddings and not model.config.model_type == 'mpt':
179
+ raise ValueError(
180
+ 'The passed in HuggingFaceModel has tied word embeddings ' +
181
+ 'and the passed in initialization device is `mixed`. ' +
182
+ 'In order to support this initialization scheme, we would need to break '
183
+ +
184
+ 'the weight tying. As a result, either use a different initialization scheme '
185
+ + 'or in the model config set `tie_word_embeddings=False`.')
186
+ else:
187
+ # When using the HF LM models,
188
+ # the weights of the self.lm_head and self.transformer.wte are tied.
189
+ # This tying occurs inside the `self.post_init()` function.
190
+ # This is a hurdle for FSDP because they need to be in the same FSDP block
191
+ # These lines ensures that both modules stay together in the top-most block when
192
+ # the model has this tying enabled (almost all do; this property defaults to True)
193
+ if model.config.tie_word_embeddings:
194
+ causal_base_model._fsdp_wrap = False
195
+ tied_embeddings._fsdp_wrap = False
196
+ lm_head._fsdp_wrap = False
197
+
198
+ # FSDP Wrap and Activation Checkpoint every model block
199
+ model.fsdp_wrap_fn = lambda module: isinstance(module, block_type)
200
+ model.activation_checkpointing_fn = lambda module: isinstance(
201
+ module, block_type)
202
+
203
+
204
+ def prepare_hf_enc_dec_model_for_fsdp(model: PreTrainedModel,
205
+ init_device: Optional[str]) -> None:
206
+ """Wrap an encoder/decoder HF model.
207
+
208
+ This works for T5, BART, Pegasus, PegasusX, but not all enc/dec (ProphetNet)
209
+ You have model.shared, model.encoder, model.decoder and model.lm_head, where
210
+ model.shared are the embeddings which are tied to model.lm_head, and
211
+ model.shared == model.encoder.embed_tokens and model.shared ==
212
+ model.decoder.embed_tokens
213
+ """
214
+ tied_embeddings = model.get_input_embeddings()
215
+ encoder = model.get_encoder()
216
+ decoder = model.get_decoder()
217
+ lm_head = model.get_output_embeddings()
218
+ # some encoder/decoders have different layers for encoder vs decoder
219
+ encoder_block = hf_get_hidden_layers(encoder)
220
+ decoder_block = hf_get_hidden_layers(decoder)
221
+
222
+ modules = {
223
+ 'encoder': encoder,
224
+ 'decoder': decoder,
225
+ 'encoder_block': encoder_block,
226
+ 'decoder_block': decoder_block,
227
+ 'lm_head': lm_head,
228
+ 'tied_embeddings': tied_embeddings
229
+ }
230
+
231
+ for mod_name, module in modules.items():
232
+ if module is None:
233
+ raise ValueError(
234
+ f'Unable to FSDP-wrap this model! `{mod_name}` does not ' +
235
+ 'follow common layer/weight naming conventions.')
236
+ decoder_block_type = type(decoder_block[0])
237
+ encoder_block_type = type(encoder_block[0])
238
+
239
+ if model.config.tie_word_embeddings:
240
+ # it is possible to train an enc/dec without tied embeddings, hence the check
241
+ tied_embeddings._fsdp_wrap = False
242
+ encoder._fsdp_wrap = False
243
+ decoder._fsdp_wrap = False
244
+ lm_head._fsdp_wrap = False
245
+
246
+ # FSDP Wrap and Activation Checkpoint every decoder block
247
+ model.fsdp_wrap_fn = lambda module: isinstance(module, decoder_block_type)
248
+ model.activation_checkpointing_fn = lambda module: isinstance(
249
+ module, decoder_block_type)
250
+
251
+ if encoder_block_type == decoder_block_type:
252
+ return
253
+
254
+ # need to wrap encoder blocks separately for ProphetNet and Marian
255
+ model.fsdp_wrap_fn = lambda module: isinstance(module, encoder_block_type)
256
+ model.activation_checkpointing_fn = lambda module: isinstance(
257
+ module, encoder_block_type)
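A small illustration of the `rhasattr` / `rgetattr` / `findattr` helpers defined above, using a throwaway nested object instead of a real HF model; the attribute names are invented for the example.

```python
# Demonstrates the dotted-attribute helpers on a fake nested object.
from types import SimpleNamespace

from llmfoundry.models.hf.hf_fsdp import findattr, rgetattr, rhasattr

fake_model = SimpleNamespace(transformer=SimpleNamespace(h=['block0', 'block1']))

print(rhasattr(fake_model, 'transformer.h'))     # True
print(rgetattr(fake_model, 'transformer.h')[0])  # 'block0'
# findattr returns the first attribute path that exists among the candidates.
print(findattr(fake_model, ('model.decoder.layers', 'transformer.h')))  # ['block0', 'block1']
```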
Perceptrix/finetune/build/lib/llmfoundry/models/hf/hf_prefix_lm.py ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Implements a Hugging Prefix LM wrapped inside a :class:`.ComposerModel`."""
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Mapping, MutableMapping
9
+
10
+ from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy
11
+ from composer.utils import dist
12
+ from omegaconf import DictConfig
13
+ from transformers import (AutoConfig, AutoModelForCausalLM,
14
+ PreTrainedTokenizerBase)
15
+
16
+ from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
17
+ from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
18
+ from llmfoundry.models.utils import (adapt_tokenizer_for_denoising,
19
+ add_bidirectional_mask_if_missing,
20
+ convert_hf_causal_lm_to_prefix_lm,
21
+ init_empty_weights)
22
+
23
+ __all__ = ['ComposerHFPrefixLM']
24
+
25
+ # HuggingFace hardcodes the ignore index to -100
26
+ _HF_IGNORE_INDEX = -100
27
+
28
+
29
+ class ComposerHFPrefixLM(HuggingFaceModelWithZLoss):
30
+ """Configures a :class:`.HuggingFaceModel` around a Prefix LM.
31
+
32
+ Note: HuggingFace does not natively support Prefix LM-style models. This function uses
33
+ `transformers.AutoModelForCausalLM` to instantiate a Causal LM, then uses a conversion utility
34
+ to turn the model into a Prefix LM. Currently, that conversion utility only supports the
35
+ following HuggingFace Causal LM types:
36
+ - `GPT2LMHeadModel`
37
+ - `GPTNeoForCausalLM`
38
+ - `GPTNeoXForCausalLM`
39
+ - `GPTJForCausalLM`
40
+ - `BloomForCausalLM`
41
+ - `OPTForCausalLM`
42
+
43
+ Args:
44
+ cfg (DictConfig): An omegaconf dictionary used to configure the model:
45
+ cfg.pretrained_model_name_or_path (str): The name of or local path to
46
+ the HF model (e.g., `gpt2` to instantiate a GPT2LMHeadModel). The model
47
+ will be converted to a Prefix LM during initialization.
48
+ cfg.config_overrides (dict, optional): An optional dictionary of keyword
49
+ arguments that override the default configuration associated with
50
+ cfg.pretrained_model_name_or_path. Default: ``{}``.
51
+ cfg.pretrained (bool): Whether to instantiate the model with pre-trained
52
+ weights coming from cfg.pretrained_model_name_or_path. If ``True``,
53
+ cfg.config_overrides must be compatible with the pre-trained weights.
54
+ cfg.init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to
55
+ initialize the model on. Currently, `meta` is only supported when
56
+ cfg.pretrained is ``False``. Default: ``'cpu'``.
57
+ cfg.z_loss (float, optional): The coefficient of the z-loss. If >0.0, this
58
+ the z-loss will be multiplied by this value before being added to the
59
+ standard loss term. Default: ``0.0``.
60
+ cfg.adapt_vocab_for_denoising (bool, optional): Whether to adapt the vocab
61
+ of the model/tokenizer to include sentinel tokens that are used in denoising
62
+ tasks like Span Corruption. If you intend to load from an existing Composer
63
+ checkpoint that was trained on such a task, set this to ``True`` to ensure
64
+ that the model vocab size matches your checkpoint's vocab size when loading
65
+ the weights. Default: ``False``.
66
+ tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
67
+ """
68
+
69
+ def __init__(self, om_model_config: DictConfig,
70
+ tokenizer: PreTrainedTokenizerBase):
71
+ config = AutoConfig.from_pretrained(
72
+ om_model_config.pretrained_model_name_or_path,
73
+ trust_remote_code=om_model_config.get('trust_remote_code', True),
74
+ use_auth_token=om_model_config.get('use_auth_token', False),
75
+ )
76
+
77
+ # set config overrides
78
+ for k, v in om_model_config.get('config_overrides', {}).items():
79
+ if not hasattr(config, k):
80
+ raise ValueError(
81
+ f'config does not have attribute "{k}" to override ({k}: {v}).'
82
+ )
83
+
84
+ attr = getattr(config, k)
85
+ if isinstance(attr, Mapping):
86
+ extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
87
+ if extra_keys:
88
+ raise ValueError(
89
+ f'Config dict override got unknown keys. ' +
90
+ f'Extra keys: {extra_keys}. ' +
91
+ f'Expected (a subset of) keys: {list(attr.keys())}.')
92
+ getattr(config, k).update(v)
93
+ else:
94
+ setattr(config, k, v)
95
+
96
+ # Set up the tokenizer (add tokens for denoising sentinels if needed)
97
+ if om_model_config.get('adapt_vocab_for_denoising', False):
98
+ adapt_tokenizer_for_denoising(tokenizer)
99
+
100
+ init_device = om_model_config.get('init_device', 'cpu')
101
+
102
+ # Get the device we want to initialize, and use the
103
+ # resolved version to initialize the HF model
104
+ resolved_init_device = hf_get_init_device(init_device)
105
+
106
+ # We need to have all non-zero local ranks be not-pretrained
107
+ # Rank 0 will still be pretrained, and distribute the weights appropriately
108
+ if dist.get_local_rank() != 0 and init_device == 'mixed':
109
+ om_model_config.pretrained = False
110
+
111
+ if resolved_init_device == 'cpu':
112
+ if om_model_config.pretrained:
113
+ model = AutoModelForCausalLM.from_pretrained(
114
+ om_model_config.pretrained_model_name_or_path,
115
+ config=config)
116
+ else:
117
+ model = AutoModelForCausalLM.from_config(config)
118
+ elif resolved_init_device == 'meta':
119
+ if om_model_config.pretrained:
120
+ raise ValueError(
121
+ 'Setting cfg.pretrained=True is not supported when init_device="meta".'
122
+ )
123
+ with init_empty_weights(include_buffers=False):
124
+ model = AutoModelForCausalLM.from_config(config)
125
+ else:
126
+ raise ValueError(
127
+ f'init_device="{init_device}" must be either "cpu" or "meta".')
128
+
129
+ # Convert the Causal LM into a Prefix LM via our custom wrapper
130
+ model = convert_hf_causal_lm_to_prefix_lm(model)
131
+
132
+ metrics = [
133
+ LanguageCrossEntropy(ignore_index=_HF_IGNORE_INDEX),
134
+ MaskedAccuracy(ignore_index=_HF_IGNORE_INDEX)
135
+ ]
136
+
137
+ composer_model = super().__init__(model=model,
138
+ shift_labels=True,
139
+ tokenizer=tokenizer,
140
+ metrics=metrics,
141
+ z_loss=om_model_config.get(
142
+ 'z_loss', 0.0),
143
+ init_device=init_device)
144
+
145
+ return composer_model
146
+
147
+ def forward(self, batch: MutableMapping):
148
+ # Add bidirectional_mask if it is missing and can be constructed
149
+ add_bidirectional_mask_if_missing(batch)
150
+ return super().forward(batch)
Perceptrix/finetune/build/lib/llmfoundry/models/hf/hf_t5.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Implements a Hugging Face T5 wrapped inside a :class:`.ComposerModel`."""
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Mapping
9
+
10
+ from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy
11
+ from composer.utils import dist
12
+ from omegaconf import DictConfig
13
+ from transformers import (AutoConfig, PreTrainedTokenizerBase,
14
+ T5ForConditionalGeneration)
15
+
16
+ from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
17
+ from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
18
+ from llmfoundry.models.utils import (adapt_tokenizer_for_denoising,
19
+ init_empty_weights)
20
+
21
+ __all__ = ['ComposerHFT5']
22
+
23
+ # HuggingFace hardcodes the ignore index to -100
24
+ _HF_IGNORE_INDEX = -100
25
+
26
+
27
+ class ComposerHFT5(HuggingFaceModelWithZLoss):
28
+ """Configures a :class:`.HuggingFaceModel` around a T5.
29
+
30
+ Note: This function uses `transformers.T5ForConditionalGeneration`. Future releases
31
+ will expand support to more general classes of HF Encoder-Decoder models.
32
+
33
+ Args:
34
+ cfg (DictConfig): An omegaconf dictionary used to configure the model:
35
+ cfg.pretrained_model_name_or_path (str): The name of or local path to
36
+ the HF model (e.g., `t5-base` to instantiate a T5 using the base config).
37
+ cfg.config_overrides (dict, optional): An optional dictionary of keyword
38
+ arguments that override the default configuration associated with
39
+ cfg.pretrained_model_name_or_path. Default: ``{}``.
40
+ cfg.pretrained (bool): Whether to instantiate the model with pre-trained
41
+ weights coming from cfg.pretrained_model_name_or_path. If ``True``,
42
+ cfg.config_overrides must be compatible with the pre-trained weights.
43
+ cfg.init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to
44
+ initialize the model on. Currently, `meta` is only supported when
45
+ cfg.pretrained is ``False``. Default: ``'cpu'``.
46
+ cfg.z_loss (float, optional): The coefficient of the z-loss. If >0.0,
47
+ the z-loss will be multiplied by this value before being added to the
48
+ standard loss term. Default: ``0.0``.
49
+ cfg.adapt_vocab_for_denoising (bool, optional): Whether to adapt the vocab
50
+ of the model/tokenizer to include sentinel tokens that are used in denoising
51
+ tasks like Span Corruption. If you intend to load from an existing Composer
52
+ checkpoint that was trained on such a task, set this to ``True`` to ensure
53
+ that the model vocab size matches your checkpoint's vocab size when loading
54
+ the weights. Default: ``False``.
55
+ tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
56
+ """
57
+
58
+ def __init__(self, om_model_config: DictConfig,
59
+ tokenizer: PreTrainedTokenizerBase):
60
+ config = AutoConfig.from_pretrained(
61
+ om_model_config.pretrained_model_name_or_path,
62
+ trust_remote_code=om_model_config.get('trust_remote_code', True),
63
+ use_auth_token=om_model_config.get('use_auth_token', False),
64
+ )
65
+
66
+ # set config overrides
67
+ for k, v in om_model_config.get('config_overrides', {}).items():
68
+ if not hasattr(config, k):
69
+ raise ValueError(
70
+ f'config does not have attribute "{k}" to override ({k}: {v}).'
71
+ )
72
+
73
+ attr = getattr(config, k)
74
+ if isinstance(attr, Mapping):
75
+ extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
76
+ if extra_keys:
77
+ raise ValueError(
78
+ f'Config dict override got unknown keys. ' +
79
+ f'Extra keys: {extra_keys}. ' +
80
+ f'Expected (a subset of) keys: {list(attr.keys())}.')
81
+ getattr(config, k).update(v)
82
+ else:
83
+ setattr(config, k, v)
84
+
85
+ if not config.is_encoder_decoder:
86
+ raise ValueError(f'Model type "hf_t5" currently only supports T5 models ' +\
87
+ f'using configs where `is_encoder_decoder` is ``True``.')
88
+
89
+ # Set up the tokenizer (add tokens for denoising sentinels if needed)
90
+ if om_model_config.get('adapt_vocab_for_denoising', False):
91
+ adapt_tokenizer_for_denoising(tokenizer)
92
+
93
+ init_device = om_model_config.get('init_device', 'cpu')
94
+
95
+ # Get the device we want to initialize, and use the
96
+ # resolved version to initialize the HF model
97
+ resolved_init_device = hf_get_init_device(init_device)
98
+
99
+ # We need to have all non-zero local ranks be not-pretrained
100
+ # Rank 0 will still be pretrained, and distribute the weights appropriately
101
+ if dist.get_local_rank() != 0 and init_device == 'mixed':
102
+ om_model_config.pretrained = False
103
+
104
+ if resolved_init_device == 'cpu':
105
+ if om_model_config.pretrained:
106
+ model = T5ForConditionalGeneration.from_pretrained(
107
+ om_model_config.pretrained_model_name_or_path,
108
+ config=config)
109
+ else:
110
+ model = T5ForConditionalGeneration(config)
111
+ elif resolved_init_device == 'meta':
112
+ if om_model_config.pretrained:
113
+ raise ValueError(
114
+ 'Setting cfg.pretrained=True is not supported when init_device="meta".'
115
+ )
116
+ with init_empty_weights(include_buffers=False):
117
+ model = T5ForConditionalGeneration(config)
118
+ else:
119
+ raise ValueError(
120
+ f'init_device="{init_device}" must be either "cpu" or "meta".')
121
+
122
+ metrics = [
123
+ LanguageCrossEntropy(ignore_index=_HF_IGNORE_INDEX),
124
+ MaskedAccuracy(ignore_index=_HF_IGNORE_INDEX)
125
+ ]
126
+
127
+ super().__init__(model=model,
128
+ tokenizer=tokenizer,
129
+ metrics=metrics,
130
+ z_loss=om_model_config.get(
131
+ 'z_loss', 0.0),
132
+ init_device=init_device)
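As a rough illustration of how the documented config keys map onto this constructor, here is a minimal sketch (editorial, not part of the repository; it assumes the `t5-small` checkpoint and tokenizer can be downloaded, and the usual entry point is the YAML-driven training script rather than direct construction):

    from omegaconf import OmegaConf
    from transformers import AutoTokenizer

    cfg = OmegaConf.create({
        'pretrained_model_name_or_path': 't5-small',
        'pretrained': True,        # load HF weights rather than a random init
        'init_device': 'cpu',
        'z_loss': 0.0001,          # small auxiliary z-loss coefficient
    })
    tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_model_name_or_path)
    model = ComposerHFT5(cfg, tokenizer)  # ComposerModel wrapping T5ForConditionalGeneration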
Perceptrix/finetune/build/lib/llmfoundry/models/hf/model_wrapper.py ADDED
@@ -0,0 +1,108 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Re-usable :class:`.ComposerModel` for LLM HF Models."""
5
+
6
+ from __future__ import annotations
7
+
8
+ import inspect
9
+ from collections import UserDict
10
+ from typing import List, Mapping, Optional
11
+
12
+ import torch
13
+ import transformers
14
+ from composer.models.huggingface import HuggingFaceModel
15
+ from torchmetrics import Metric
16
+ from transformers import PreTrainedTokenizerBase
17
+ from transformers.utils.generic import ModelOutput
18
+
19
+ from llmfoundry.models.hf.hf_fsdp import prepare_hf_model_for_fsdp
20
+
21
+ # HuggingFace hardcodes the ignore index to -100
22
+ _HF_IGNORE_INDEX = -100
23
+
24
+
25
+ class HuggingFaceModelWithZLoss(HuggingFaceModel):
26
+ """Wrapper around HuggingFaceModel.
27
+
28
+ This adds z-loss, which is used in some training contexts,
29
+ and is a convenient way to patch features that are generically
30
+ useful for HF models.
31
+ See use of z_loss in PaLM: https://arxiv.org/abs/2204.02311v3, Section 5.
32
+ Also, from https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666:
33
+ Two uses of z_loss are:
34
+ - To keep the logits from drifting too far from zero, which can cause
35
+ unacceptable roundoff errors in bfloat16.
36
+ - To encourage the logits to be normalized log-probabilities.
37
+
38
+ Handles preparation for FSDP wrapping.
39
+ """
40
+
41
+ def __init__(self,
42
+ model: transformers.PreTrainedModel,
43
+ tokenizer: Optional[PreTrainedTokenizerBase] = None,
44
+ metrics: Optional[List[Metric]] = None,
45
+ eval_metrics: Optional[List[Metric]] = None,
46
+ z_loss: float = 0.0,
47
+ shift_labels: bool = False,
48
+ init_device: Optional[str] = None):
49
+ super().__init__(model,
50
+ tokenizer,
51
+ use_logits=True,
52
+ metrics=metrics,
53
+ eval_metrics=eval_metrics,
54
+ shift_labels=shift_labels)
55
+ self.z_loss = float(z_loss)
56
+ if self.z_loss < 0.0:
57
+ raise ValueError(f'z_loss(={z_loss}) cannot be negative.')
58
+
59
+ self.model_forward_args = inspect.getfullargspec(
60
+ self.model.forward).args
61
+ # inspect.getfullargspec may not return the forward args correctly for HuggingFace quantized models
62
+ if not self.model_forward_args:
63
+ self.model_forward_args = inspect.signature(
64
+ self.model.forward).parameters.keys()
65
+
66
+ # Note: We need to add the FSDP related attributes to the model AFTER the super init,
67
+ # so that the (possible) embedding resizing doesn't destroy them
68
+ prepare_hf_model_for_fsdp(self.model, init_device)
69
+
70
+ # This provides support for meta initialization when using FSDP
71
+ self.model.param_init_fn = lambda module: self.model._init_weights(
72
+ module)
73
+
74
+ def forward(self, batch: Mapping):
75
+ if isinstance(batch, dict) or isinstance(batch, UserDict):
76
+ # Further input validation is left to the huggingface forward call
77
+ batch = {
78
+ k: v for k, v in batch.items() if k in self.model_forward_args
79
+ }
80
+ output = self.model(**batch) # type: ignore (thirdparty)
81
+ else:
82
+ raise ValueError(
83
+ 'Unexpected batch type. Expected a dictionary with keys corresponding to the inputs to the forward function of the Huggingface model'
84
+ )
85
+ return output
86
+
87
+ def loss(self, outputs: ModelOutput, batch: Mapping):
88
+ if self.config.use_return_dict:
89
+ loss, logits = outputs['loss'], outputs['logits']
90
+ else:
91
+ # loss is at index 0 in the output tuple, logits are at index 1
92
+ loss, logits = outputs[:2]
93
+ if self.z_loss == 0.0:
94
+ return loss
95
+
96
+ # Add a z_loss to the standard loss
97
+ logits_flat = logits.view(-1, logits.size(-1))
98
+ labels_flat = batch['labels'].view(-1)
99
+ log_z = torch.logsumexp(logits_flat[labels_flat != _HF_IGNORE_INDEX],
100
+ dim=1)
101
+ log_z2 = log_z**2
102
+ z_loss = log_z2.mean() * self.z_loss
103
+ if self.config.use_return_dict:
104
+ outputs['loss'] += z_loss
105
+ return outputs['loss']
106
+ else:
107
+ outputs[0] += z_loss
108
+ return outputs[0]
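To make the auxiliary term above concrete, here is a small self-contained sketch of the same z-loss computation on toy tensors (values are illustrative; the 1e-4 coefficient mirrors the PaLM setting referenced in the class docstring):

    import torch

    logits = torch.randn(4, 6)              # 4 token positions, vocabulary of 6
    labels = torch.tensor([2, 5, -100, 1])  # -100 marks an ignored position
    z_loss_coeff = 1e-4

    # log of the softmax normalizer Z at each non-ignored position
    log_z = torch.logsumexp(logits[labels != -100], dim=1)
    # penalizes logits drifting away from normalized log-probabilities
    z_loss = z_loss_coeff * (log_z ** 2).mean()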
Perceptrix/finetune/build/lib/llmfoundry/models/inference_api_wrapper/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from llmfoundry.models.inference_api_wrapper.interface import \
5
+ InferenceAPIEvalWrapper
6
+ from llmfoundry.models.inference_api_wrapper.openai_causal_lm import (
7
+ OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper)
8
+
9
+ __all__ = [
10
+ 'OpenAICausalLMEvalWrapper',
11
+ 'OpenAIChatAPIEvalWrapper',
12
+ 'InferenceAPIEvalWrapper',
13
+ ]
Perceptrix/finetune/build/lib/llmfoundry/models/inference_api_wrapper/interface.py ADDED
@@ -0,0 +1,110 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Any, Dict, Optional
5
+
6
+ import torch
7
+ from composer.core.types import Batch
8
+ from composer.metrics import InContextLearningMetric
9
+ from composer.metrics.nlp import (InContextLearningLMAccuracy,
10
+ InContextLearningLMExpectedCalibrationError,
11
+ InContextLearningMCExpectedCalibrationError,
12
+ InContextLearningMultipleChoiceAccuracy,
13
+ InContextLearningQAAccuracy,
14
+ LanguageCrossEntropy, LanguagePerplexity)
15
+ from composer.models import ComposerModel
16
+ from torchmetrics import Metric
17
+ from transformers import AutoTokenizer
18
+
19
+
20
+ class InferenceAPIEvalWrapper(ComposerModel):
21
+
22
+ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer):
23
+ self.tokenizer = tokenizer
24
+ self.labels = None
25
+ # set up training and eval metrics
26
+ eval_metrics = [
27
+ LanguageCrossEntropy(),
28
+ LanguagePerplexity(),
29
+ InContextLearningLMAccuracy(),
30
+ InContextLearningMultipleChoiceAccuracy(),
31
+ InContextLearningQAAccuracy(),
32
+ InContextLearningLMExpectedCalibrationError(),
33
+ InContextLearningMCExpectedCalibrationError()
34
+ ]
35
+ self.eval_metrics = {
36
+ metric.__class__.__name__: metric for metric in eval_metrics
37
+ }
38
+ super().__init__()
39
+
40
+ def get_metrics(self, is_train: bool = False):
41
+ if is_train:
42
+ raise NotImplementedError(
43
+ 'You cannot use inference wrappers for training')
44
+ else:
45
+ metrics = self.eval_metrics
46
+
47
+ return metrics if metrics else {}
48
+
49
+ def get_next_token_logit_tensor(self,
50
+ prompt: str) -> Optional[torch.Tensor]:
51
+ raise NotImplementedError
52
+
53
+ def rebatch(self, batch: Batch):
54
+ # default is a no-op, but Chat API modifies these
55
+ return batch
56
+
57
+ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None):
58
+ # If the batch mode is generate, we will generate a requested number of tokens using the underlying
59
+ # model's generate function. Extra generation kwargs can be passed in via the batch. Strings will
60
+ # be returned from eval_forward
61
+ output_logits_batch = []
62
+ for tokens, cont_idxs in zip(batch['input_ids'],
63
+ batch['continuation_indices']):
64
+
65
+ seqlen = tokens.shape[0]
66
+ tokens = tokens.tolist()
67
+ cont_idxs = cont_idxs.tolist()
68
+ expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1]
69
+ output_logits = torch.nn.functional.one_hot(
70
+ torch.tensor(tokens[1:cont_idxs[0]]),
71
+ num_classes=self.tokenizer.vocab_size)
72
+ for i in range(len(expected_cont_tokens)):
73
+ # decode one token at a time
74
+ prompt = self.tokenizer.decode(tokens[:cont_idxs[0]] +
75
+ expected_cont_tokens[0:i])
76
+ next_logit_tensor = self.get_next_token_logit_tensor(prompt)
77
+ if next_logit_tensor is None:
78
+ continue
79
+ output_logits = torch.cat(
80
+ [output_logits,
81
+ next_logit_tensor.reshape(1, -1)])
82
+ padding = torch.nn.functional.one_hot(
83
+ torch.full((seqlen - output_logits.shape[0],),
84
+ self.tokenizer.pad_token_id),
85
+ num_classes=self.tokenizer.vocab_size)
86
+ output_logits = torch.cat([output_logits, padding])
87
+ output_logits_batch.append(output_logits)
88
+
89
+ return torch.stack(output_logits_batch).to(batch['input_ids'].device)
90
+
91
+ def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None:
92
+ batch = self.rebatch(batch)
93
+ self.labels = batch.pop('labels')
94
+ self.labels[:, :-1] = self.labels[:, 1:].clone()
95
+ self.labels[:, -1] = -100
96
+ if isinstance(metric, InContextLearningMetric) and batch.get(
97
+ 'mode', None) == 'icl_task':
98
+ assert self.labels is not None
99
+ metric.update(batch, outputs, self.labels)
100
+ else:
101
+ raise NotImplementedError(
102
+ 'Inference API wrapper only supports InContextLearningMetrics and mode=icl_task'
103
+ )
104
+
105
+ def forward(self):
106
+ raise NotImplementedError(
107
+ "Inference API wrapper doesn't support forward")
108
+
109
+ def loss(self):
110
+ raise NotImplementedError("Inference API wrapper doesn't support loss")
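The base class leaves `get_next_token_logit_tensor` abstract; a concrete wrapper only needs to turn a prompt into one vocab-sized row of logits. A hypothetical, do-nothing subclass (a sketch that assumes `InferenceAPIEvalWrapper` above is in scope, not a real API client) might look like this:

    import torch

    class ConstantTokenEvalWrapper(InferenceAPIEvalWrapper):
        """Toy wrapper that always 'predicts' token id 0, for illustration only."""

        def get_next_token_logit_tensor(self, prompt: str):
            return torch.nn.functional.one_hot(
                torch.tensor(0), num_classes=self.tokenizer.vocab_size).float()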
Perceptrix/finetune/build/lib/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py ADDED
@@ -0,0 +1,243 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Implements OpenAI chat and causal LM inference API wrappers."""
5
+
6
+ import logging
7
+ import os
8
+ from time import sleep
9
+ from typing import Any, Dict, List, Optional, Union
10
+
11
+ import torch
12
+ from composer.core.types import Batch
13
+ from composer.utils.import_helpers import MissingConditionalImportError
14
+ from transformers import AutoTokenizer
15
+
16
+ log = logging.getLogger(__name__)
17
+
18
+ from llmfoundry.models.inference_api_wrapper.interface import \
19
+ InferenceAPIEvalWrapper
20
+
21
+ __all__ = [
22
+ 'OpenAICausalLMEvalWrapper',
23
+ 'OpenAIChatAPIEvalWrapper',
24
+ ]
25
+
26
+ MAX_RETRIES = 10
27
+
28
+
29
+ class OpenAIEvalInterface(InferenceAPIEvalWrapper):
30
+
31
+ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None:
32
+ super().__init__(model_cfg, tokenizer)
33
+ try:
34
+ import openai
35
+ except ImportError as e:
36
+ raise MissingConditionalImportError(
37
+ extra_deps_group='openai',
38
+ conda_package='openai',
39
+ conda_channel='conda-forge') from e
40
+ openai.api_key = os.getenv('OPENAI_API_KEY')
41
+ self.model_name = model_cfg['version']
42
+
43
+ def generate_completion(self, prompt: str, num_tokens: int):
44
+ raise NotImplementedError()
45
+
46
+ def process_result(self, completion: Optional[dict]):
47
+ raise NotImplementedError()
48
+
49
+ def get_next_token_logit_tensor(self, prompt: str, num_tokens: int = 1):
50
+ completion = self.try_generate_completion(prompt, num_tokens)
51
+ return self.process_result(completion)
52
+
53
+ def try_generate_completion(self, prompt: str, num_tokens: int):
54
+ try:
55
+ from openai.error import RateLimitError
56
+ except ImportError as e:
57
+ raise MissingConditionalImportError(
58
+ extra_deps_group='openai',
59
+ conda_package='openai',
60
+ conda_channel='conda-forge') from e
61
+ tries = 0
62
+ completion = None
63
+ while tries < MAX_RETRIES:
64
+ tries += 1
65
+ try:
66
+
67
+ completion = self.generate_completion(prompt, num_tokens)
68
+ break
69
+ except RateLimitError as e:
70
+ if 'You exceeded your current quota' in str(e._message):
71
+ raise e
72
+ sleep(60)
73
+ continue
74
+ except Exception:
75
+ continue
76
+ return completion
77
+
78
+
79
+ class OpenAIChatAPIEvalWrapper(OpenAIEvalInterface):
80
+
81
+ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None:
82
+ super().__init__(model_cfg, tokenizer)
83
+ try:
84
+ import openai
85
+ except ImportError as e:
86
+ raise MissingConditionalImportError(
87
+ extra_deps_group='openai',
88
+ conda_package='openai',
89
+ conda_channel='conda-forge') from e
90
+
91
+ self.generate_completion = lambda prompt, num_tokens: openai.ChatCompletion.create(
92
+ model=self.model_name,
93
+ messages=[{
94
+ 'role': 'user',
95
+ 'content': prompt
96
+ }],
97
+ max_tokens=num_tokens,
98
+ temperature=0.0)
99
+
100
+ def retokenize(self, tokens: List[int], cont_idxs: List[int]):
101
+ """Chat API will never respond with a word-initial space.
102
+
103
+ If the continuation tokens begin with a word-initial space, we need to
104
+ re-tokenize with the space removed.
105
+ """
106
+ original_len = len(tokens)
107
+ retokenized_continuation = self.tokenizer(
108
+ self.tokenizer.decode(tokens[cont_idxs[0]:cont_idxs[-1] +
109
+ 1]).strip())['input_ids']
110
+
111
+ # replace the original continuation with the retokenized continuation + padding
112
+ padding = [tokens[-1]] * (
113
+ len(tokens) - len(tokens[:cont_idxs[0]] + retokenized_continuation))
114
+ tokens = tokens[:cont_idxs[0]] + retokenized_continuation + padding
115
+
116
+ if len(tokens) > original_len:
117
+ # this only happens if we were already at max seq len and the continuation got LARGER
118
+ tokens = tokens[-original_len:]
119
+ cont_idxs = list(
120
+ range(original_len - len(retokenized_continuation),
121
+ original_len))
122
+ else:
123
+ cont_idxs = list(
124
+ range(cont_idxs[0],
125
+ cont_idxs[0] + len(retokenized_continuation)))
126
+ return torch.tensor(tokens), torch.tensor(cont_idxs)
127
+
128
+ def rebatch(self, batch: Batch):
129
+ """Chat API tokenization has different behavior than GPT3.
130
+
131
+ Model responses will never begin with spaces even if the continuation is
132
+ expected to, so we need to retokenize the input to account for that.
133
+ """
134
+ new_batch: Dict[str, Union[List[torch.Tensor], torch.Tensor]] = {
135
+ 'input_ids': [],
136
+ 'continuation_indices': [],
137
+ 'labels': []
138
+ }
139
+ for tokens, cont_idxs in zip(batch['input_ids'],
140
+ batch['continuation_indices']):
141
+ tokens, cont_idxs = self.retokenize(tokens.tolist(),
142
+ cont_idxs.tolist())
143
+
144
+ assert isinstance(new_batch['input_ids'], list)
145
+ new_batch['input_ids'].append(tokens)
146
+ assert isinstance(new_batch['labels'], list)
147
+ new_batch['labels'].append(tokens)
148
+ assert isinstance(new_batch['continuation_indices'], list)
149
+ new_batch['continuation_indices'].append(cont_idxs)
150
+
151
+ new_batch.update({
152
+ k: torch.stack(new_batch[k]) # pyright: ignore
153
+ for k in ['input_ids', 'labels']
154
+ })
155
+
156
+ new_batch.update({k: v for k, v in batch.items() if k not in new_batch})
157
+
158
+ return new_batch
159
+
160
+ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None):
161
+ # Override the base class because Chat's API always strips spacing from model outputs resulting in different tokens
162
+ # than what the continuation would expect.
163
+ # Get around this issue by retokenizing the batch to remove spacing from the continuation as well as
164
+ # decoding the whole continuation at once.
165
+ output_logits_batch = []
166
+ batch = self.rebatch(batch)
167
+ for tokens, cont_idxs in zip(batch['input_ids'],
168
+ batch['continuation_indices']):
169
+
170
+ seqlen = tokens.shape[0]
171
+ tokens = tokens.tolist()
172
+ cont_idxs = cont_idxs.tolist()
173
+ expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1]
174
+ output_logits = torch.nn.functional.one_hot(
175
+ torch.tensor(tokens[1:cont_idxs[0]]),
176
+ num_classes=self.tokenizer.vocab_size)
177
+
178
+ prompt = self.tokenizer.decode(tokens[:cont_idxs[0]])
179
+ next_logit_tensor = self.get_next_token_logit_tensor(
180
+ prompt, num_tokens=len(expected_cont_tokens))
181
+
182
+ if next_logit_tensor is not None:
183
+ output_logits = torch.cat([output_logits, next_logit_tensor])
184
+ padding = torch.nn.functional.one_hot(
185
+ torch.full((seqlen - output_logits.shape[0],),
186
+ self.tokenizer.pad_token_id),
187
+ num_classes=self.tokenizer.vocab_size)
188
+ output_logits = torch.cat([output_logits, padding])
189
+ output_logits_batch.append(output_logits)
190
+
191
+ return torch.stack(output_logits_batch).to(batch['input_ids'].device)
192
+
193
+ def process_result(self, completion: Optional[dict]):
194
+ assert isinstance(completion, dict)
195
+ if len(completion['choices']) > 0:
196
+ tensors = []
197
+ for t in self.tokenizer(completion['choices'][0]['message']
198
+ ['content'])['input_ids']:
199
+ tensors.append(
200
+ self.tokenizer.construct_logit_tensor(
201
+ {self.tokenizer.decode([t]): 0.0}))
202
+
203
+ if len(tensors) == 0:
204
+ return None
205
+ return torch.stack(tensors)
206
+ else:
207
+ # the model sometimes stops early even though we are still requesting tokens!
208
+ # not sure if there's a fix
209
+ return None
210
+
211
+
212
+ class OpenAICausalLMEvalWrapper(OpenAIEvalInterface):
213
+
214
+ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None:
215
+ super().__init__(model_cfg, tokenizer)
216
+ try:
217
+ import openai
218
+ except ImportError as e:
219
+ raise MissingConditionalImportError(
220
+ extra_deps_group='openai',
221
+ conda_package='openai',
222
+ conda_channel='conda-forge') from e
223
+
224
+ self.generate_completion = lambda prompt, num_tokens: openai.Completion.create(
225
+ engine=self.model_name,
226
+ prompt=prompt,
227
+ max_tokens=1,
228
+ logprobs=5,
229
+ temperature=0.0)
230
+
231
+ def process_result(self, completion: Optional[dict]):
232
+ if completion is None:
233
+ raise ValueError("Couldn't generate model output")
234
+
235
+ assert isinstance(completion, dict)
236
+ if len(completion['choices'][0]['logprobs']['top_logprobs']) > 0:
237
+ tensor = self.tokenizer.construct_logit_tensor(
238
+ dict(completion['choices'][0]['logprobs']['top_logprobs'][0]))
239
+ return tensor
240
+ else:
241
+ # the model sometimes stops early even though we are still requesting tokens!
242
+ # not sure if there's a fix
243
+ return None
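For reference, a wrapper instance is configured with the OpenAI model name under the `version` key and reads `OPENAI_API_KEY` from the environment. The sketch below is illustrative only: it assumes the pre-1.0 `openai` package used above, uses a placeholder key, and borrows a GPT-2 tokenizer purely as a stand-in.

    import os
    from transformers import AutoTokenizer

    os.environ.setdefault('OPENAI_API_KEY', 'sk-...')   # placeholder, not a real key
    tokenizer = AutoTokenizer.from_pretrained('gpt2')   # stand-in tokenizer for the sketch
    chat_eval = OpenAIChatAPIEvalWrapper({'version': 'gpt-3.5-turbo'}, tokenizer)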
Perceptrix/finetune/build/lib/llmfoundry/models/layers/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from llmfoundry.models.layers.attention import (
5
+ ATTN_CLASS_REGISTRY, MultiheadAttention, MultiQueryAttention,
6
+ attn_bias_shape, build_alibi_bias, build_attn_bias, flash_attn_fn,
7
+ scaled_multihead_dot_product_attention, triton_flash_attn_fn)
8
+ from llmfoundry.models.layers.blocks import MPTBlock
9
+ from llmfoundry.models.layers.custom_embedding import SharedEmbedding
10
+ from llmfoundry.models.layers.fc import FC_CLASS_REGISTRY
11
+ from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
12
+ from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY, LPLayerNorm
13
+
14
+ __all__ = [
15
+ 'scaled_multihead_dot_product_attention',
16
+ 'flash_attn_fn',
17
+ 'triton_flash_attn_fn',
18
+ 'MultiheadAttention',
19
+ 'MultiQueryAttention',
20
+ 'attn_bias_shape',
21
+ 'build_attn_bias',
22
+ 'build_alibi_bias',
23
+ 'ATTN_CLASS_REGISTRY',
24
+ 'MPTMLP',
25
+ 'MPTBlock',
26
+ 'NORM_CLASS_REGISTRY',
27
+ 'LPLayerNorm',
28
+ 'FC_CLASS_REGISTRY',
29
+ 'SharedEmbedding',
30
+ 'FFN_CLASS_REGISTRY',
31
+ 'build_ffn',
32
+ ]
Perceptrix/finetune/build/lib/llmfoundry/models/layers/attention.py ADDED
@@ -0,0 +1,768 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Attention layers."""
5
+
6
+ import math
7
+ import warnings
8
+ from typing import Any, List, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from einops import rearrange
13
+ from packaging import version
14
+ from torch import nn
15
+
16
+ from llmfoundry.models.layers.fc import FC_CLASS_REGISTRY
17
+ from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
18
+
19
+
20
+ def is_flash_v2_installed():
21
+ try:
22
+ import flash_attn as flash_attn
23
+ except:
24
+ return False
25
+ return version.parse(flash_attn.__version__) >= version.parse('2.0.0')
26
+
27
+
28
+ def is_flash_v1_installed():
29
+ try:
30
+ import flash_attn as flash_attn
31
+ except:
32
+ return False
33
+ return version.parse(flash_attn.__version__) < version.parse('2.0.0')
34
+
35
+
36
+ def _reset_is_causal(num_query_tokens: int, num_key_tokens: int,
37
+ original_is_causal: bool) -> bool:
38
+ # disable causal when it is not needed
39
+ # necessary for flash & triton for generation with kv_cache
40
+ if original_is_causal and num_query_tokens != num_key_tokens:
41
+ if num_query_tokens != 1:
42
+ raise NotImplementedError(
43
+ 'MPT does not support query and key with different number of tokens, unless number of query tokens is 1.'
44
+ )
45
+ else:
46
+ return False
47
+ return original_is_causal
48
+
49
+
50
+ def repeat_kv_for_gqa(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
51
+ """Perform repeat of kv heads along a particular dimension.
52
+
53
+ hidden.shape expected to be: (batch size, seq len, kv_n_heads, head_dim)
54
+ n_rep: number of repetitions of kv_n_heads
55
+ Unlike torch.repeat_interleave, this function avoids allocating new memory.
56
+ """
57
+ if n_rep == 1:
58
+ return hidden
59
+
60
+ b, s, kv_n_heads, d = hidden.shape
61
+
62
+ hidden = hidden[:, :, :, None, :].expand(b, s, kv_n_heads, n_rep, d)
63
+
64
+ return hidden.reshape(b, s, kv_n_heads * n_rep, d)
65
+
66
+
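A quick shape check of the helper above (illustrative only): repeating 2 KV heads three times yields 6 heads, using an `expand` view rather than copies until the final reshape.

    import torch

    h = torch.randn(1, 4, 2, 16)   # (batch, seq, kv_n_heads, head_dim)
    assert repeat_kv_for_gqa(h, 3).shape == (1, 4, 6, 16)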
67
+ def scaled_multihead_dot_product_attention(
68
+ query: torch.Tensor,
69
+ key: torch.Tensor,
70
+ value: torch.Tensor,
71
+ n_heads: int,
72
+ kv_n_heads: Optional[int] = None,
73
+ past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
74
+ softmax_scale: Optional[float] = None,
75
+ attn_bias: Optional[torch.Tensor] = None,
76
+ key_padding_mask: Optional[torch.Tensor] = None,
77
+ is_causal: bool = False,
78
+ dropout_p: float = 0.0,
79
+ training: bool = False,
80
+ needs_weights: bool = False,
81
+ multiquery: bool = False,
82
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor,
83
+ torch.Tensor]]]:
84
+
85
+ if multiquery:
86
+ warnings.warn(
87
+ DeprecationWarning(
88
+ 'The direct use of the multiquery arg is deprecated. Setting kv_n_heads=1 automatically. Please set kv_n_heads=1 explicitly to remove this warning.'
89
+ ))
90
+ kv_n_heads = 1
91
+ elif kv_n_heads is None:
92
+ warnings.warn(
93
+ DeprecationWarning(
94
+ 'Not specifying a value for the kv_n_heads arg is deprecated. Setting kv_n_heads=n_heads automatically. Please set kv_n_heads=n_heads explicitly to remove this warning.'
95
+ ))
96
+ kv_n_heads = n_heads
97
+
98
+ q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
99
+ k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
100
+ v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads)
101
+
102
+ if past_key_value is not None:
103
+ # attn_impl: flash & triton use kernels which expect input shape [b, s, h, d_head].
104
+ # kv_cache is therefore stored using that shape.
105
+ # attn_impl: torch stores the kv_cache in the ordering which is most advantageous
106
+ # for its attn computation ie
107
+ # keys are stored as tensors with shape [b, h, d_head, s] and
108
+ # values are stored as tensors with shape [b, h, s, d_head]
109
+ if len(past_key_value) != 0:
110
+ k = torch.cat([past_key_value[0], k], dim=3)
111
+ v = torch.cat([past_key_value[1], v], dim=2)
112
+
113
+ past_key_value = (k, v)
114
+
115
+ b, _, s_q, d = q.shape
116
+ s_k = k.size(-1)
117
+
118
+ # grouped query case
119
+ if kv_n_heads > 1 and kv_n_heads < n_heads:
120
+ # necessary to do a transpose to swap (b h s d) -> (b s h d) for repeat_kv_for_gqa function
121
+ k = repeat_kv_for_gqa(k.transpose(1, 2),
122
+ n_heads // kv_n_heads).transpose(1, 2)
123
+ v = repeat_kv_for_gqa(v.transpose(1, 2),
124
+ n_heads // kv_n_heads).transpose(1, 2)
125
+
126
+ if softmax_scale is None:
127
+ softmax_scale = 1 / math.sqrt(d)
128
+
129
+ attn_weight = q.matmul(k) * softmax_scale
130
+
131
+ if attn_bias is not None:
132
+ # clamp to 0 necessary for torch 2.0 compile()
133
+ _s_q = max(0, attn_bias.size(2) - s_q)
134
+ _s_k = max(0, attn_bias.size(3) - s_k)
135
+ attn_bias = attn_bias[:, :, _s_q:, _s_k:]
136
+
137
+ if (attn_bias.size(-1) != 1 and
138
+ attn_bias.size(-1) != s_k) or (attn_bias.size(-2) != 1 and
139
+ attn_bias.size(-2) != s_q):
140
+ raise RuntimeError(
141
+ f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.'
142
+ )
143
+ attn_weight = attn_weight + attn_bias
144
+
145
+ min_val = torch.finfo(q.dtype).min
146
+
147
+ if key_padding_mask is not None:
148
+ if attn_bias is not None:
149
+ warnings.warn(
150
+ 'Propagating key_padding_mask to the attention module ' +\
151
+ 'and applying it within the attention module can cause ' +\
152
+ 'unnecessary computation/memory usage. Consider integrating ' +\
153
+ 'into attn_bias once and passing that to each attention ' +\
154
+ 'module instead.'
155
+ )
156
+ attn_weight = attn_weight.masked_fill(
157
+ ~key_padding_mask.view((b, 1, 1, s_k)), min_val)
158
+
159
+ if is_causal and (not q.size(2) == 1):
160
+ s = max(s_q, s_k)
161
+ causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32)
162
+ causal_mask = causal_mask.tril()
163
+ causal_mask = causal_mask.to(torch.bool)
164
+ causal_mask = ~causal_mask
165
+ causal_mask = causal_mask[-s_q:, -s_k:]
166
+ attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k),
167
+ min_val)
168
+
169
+ attn_weight = torch.softmax(attn_weight, dim=-1)
170
+
171
+ if dropout_p:
172
+ attn_weight = torch.nn.functional.dropout(attn_weight,
173
+ p=dropout_p,
174
+ training=training,
175
+ inplace=True)
176
+
177
+ out = attn_weight.to(v.dtype).matmul(v)
178
+ out = rearrange(out, 'b h s d -> b s (h d)')
179
+
180
+ if needs_weights:
181
+ return out, attn_weight, past_key_value
182
+ return out, None, past_key_value
183
+
184
+
185
+ def check_valid_inputs(*tensors: torch.Tensor,
186
+ valid_dtypes: Optional[List[torch.dtype]] = None):
187
+ if valid_dtypes is None:
188
+ valid_dtypes = [torch.float16, torch.bfloat16]
189
+ for tensor in tensors:
190
+ if tensor.dtype not in valid_dtypes:
191
+ raise TypeError(f'{tensor.dtype=} must be in {valid_dtypes=}.')
192
+ if not tensor.is_cuda:
193
+ raise TypeError(f'Inputs must be cuda tensors ({tensor.is_cuda=}).')
194
+
195
+
196
+ def flash_attn_fn(
197
+ query: torch.Tensor,
198
+ key: torch.Tensor,
199
+ value: torch.Tensor,
200
+ n_heads: int,
201
+ kv_n_heads: Optional[int] = None,
202
+ past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
203
+ softmax_scale: Optional[float] = None,
204
+ attn_bias: Optional[torch.Tensor] = None,
205
+ key_padding_mask: Optional[torch.Tensor] = None,
206
+ is_causal: bool = False,
207
+ dropout_p: float = 0.0,
208
+ training: bool = False,
209
+ needs_weights: bool = False,
210
+ multiquery: bool = False,
211
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor,
212
+ torch.Tensor]]]:
213
+ try:
214
+ from flash_attn import bert_padding, flash_attn_interface # type: ignore # yapf: disable # isort: skip
215
+ except:
216
+ raise RuntimeError(
217
+ 'Please install flash-attn==1.0.9 or flash-attn==2.3.2')
218
+
219
+ check_valid_inputs(query, key, value)
220
+
221
+ if multiquery:
222
+ warnings.warn(
223
+ DeprecationWarning(
224
+ 'The direct use of the multiquery arg is deprecated. Setting kv_n_heads=1 automatically. Please set kv_n_heads=1 explicitly to remove this warning.'
225
+ ))
226
+ kv_n_heads = 1
227
+ elif kv_n_heads is None:
228
+ warnings.warn(
229
+ DeprecationWarning(
230
+ 'Not specifying a value for the kv_n_heads arg is deprecated. Setting kv_n_heads=n_heads automatically. Please set kv_n_heads=n_heads explicitly to remove this warning.'
231
+ ))
232
+ kv_n_heads = n_heads
233
+
234
+ if past_key_value is not None:
235
+ if len(past_key_value) != 0:
236
+ key = torch.cat([past_key_value[0], key], dim=1)
237
+ value = torch.cat([past_key_value[1], value], dim=1)
238
+
239
+ past_key_value = (key, value)
240
+
241
+ if attn_bias is not None:
242
+ # clamp to 0 necessary for torch 2.0 compile()
243
+ _s_q = max(0, attn_bias.size(2) - query.size(1))
244
+ _s_k = max(0, attn_bias.size(3) - key.size(1))
245
+ attn_bias = attn_bias[:, :, _s_q:, _s_k:]
246
+
247
+ if attn_bias is not None:
248
+ raise NotImplementedError(f'attn_bias not implemented for flash attn.')
249
+
250
+ batch_size, seqlen = query.shape[:2]
251
+
252
+ if key_padding_mask is None:
253
+ key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
254
+ query_padding_mask = key_padding_mask[:, -query.size(1):]
255
+
256
+ query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input(
257
+ query, query_padding_mask)
258
+ query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
259
+
260
+ key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input(
261
+ key, key_padding_mask)
262
+ key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
263
+
264
+ value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask)
265
+ value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
266
+
267
+ # multi-query case
268
+ if kv_n_heads == 1:
269
+ # Expanding a tensor does not allocate new memory, but only creates a new
270
+ # view on the existing tensor where a dimension of size one is expanded
271
+ # to a larger size by setting the stride to 0.
272
+ # - pytorch docs
273
+ #
274
+ # hopefully the kernels can utilize this and we're not just wasting bandwidth here
275
+ key_unpad = key_unpad.expand(key_unpad.size(0), n_heads,
276
+ key_unpad.size(-1))
277
+ value_unpad = value_unpad.expand(value_unpad.size(0), n_heads,
278
+ value_unpad.size(-1))
279
+ # grouped query case
280
+ elif kv_n_heads < n_heads:
281
+ # Each query belongs to a group of kv heads of group size n_heads // kv_n_heads
282
+ # We repeat each kv head by the group size number to use the underlying MHA kernels
283
+
284
+ # since repeat_kv_for_gqa expects input dims of (b, s, kv_n_heads, d)
285
+ # we use .view to modify {key, value}_unpad appropriately
286
+
287
+ key_unpad = repeat_kv_for_gqa(
288
+ key_unpad.view(batch_size, seqlen, kv_n_heads, -1),
289
+ n_heads // kv_n_heads).view(batch_size * seqlen, n_heads, -1)
290
+ value_unpad = repeat_kv_for_gqa(
291
+ value_unpad.view(batch_size, seqlen, kv_n_heads, -1),
292
+ n_heads // kv_n_heads).view(batch_size * seqlen, n_heads, -1)
293
+
294
+ dropout_p = dropout_p if training else 0.0
295
+
296
+ reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
297
+
298
+ if is_flash_v1_installed():
299
+ output_unpad = flash_attn_interface.flash_attn_unpadded_func(
300
+ q=query_unpad,
301
+ k=key_unpad,
302
+ v=value_unpad,
303
+ cu_seqlens_q=cu_seqlens_q,
304
+ cu_seqlens_k=cu_seqlens_k,
305
+ max_seqlen_q=max_seqlen_q,
306
+ max_seqlen_k=max_seqlen_k,
307
+ dropout_p=dropout_p,
308
+ softmax_scale=softmax_scale,
309
+ causal=reset_is_causal,
310
+ return_attn_probs=needs_weights)
311
+ elif is_flash_v2_installed():
312
+ output_unpad = flash_attn_interface.flash_attn_varlen_func(
313
+ q=query_unpad,
314
+ k=key_unpad,
315
+ v=value_unpad,
316
+ cu_seqlens_q=cu_seqlens_q,
317
+ cu_seqlens_k=cu_seqlens_k,
318
+ max_seqlen_q=max_seqlen_q,
319
+ max_seqlen_k=max_seqlen_k,
320
+ dropout_p=dropout_p,
321
+ softmax_scale=softmax_scale,
322
+ causal=reset_is_causal,
323
+ return_attn_probs=needs_weights)
324
+ else:
325
+ raise RuntimeError(
326
+ 'flash-attn==1.0.9 or flash-attn==2.3.2 is required.')
327
+
328
+ output = bert_padding.pad_input(
329
+ rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size,
330
+ seqlen)
331
+ return output, None, past_key_value
332
+
333
+
334
+ def triton_flash_attn_fn(
335
+ query: torch.Tensor,
336
+ key: torch.Tensor,
337
+ value: torch.Tensor,
338
+ n_heads: int,
339
+ kv_n_heads: Optional[int] = None,
340
+ past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
341
+ softmax_scale: Optional[float] = None,
342
+ attn_bias: Optional[torch.Tensor] = None,
343
+ key_padding_mask: Optional[torch.Tensor] = None,
344
+ is_causal: bool = False,
345
+ dropout_p: float = 0.0,
346
+ training: bool = False,
347
+ needs_weights: bool = False,
348
+ multiquery: bool = False,
349
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor,
350
+ torch.Tensor]]]:
351
+ try:
352
+ from llmfoundry.models.layers.flash_attn_triton import flash_attn_func
353
+ except:
354
+ _installed = False
355
+ if version.parse(torch.__version__) < version.parse('2.0.0'):
356
+ _installed = True
357
+ # if torch 1.13.1, revert to using the triton flash attn from HazyResearch
358
+ # with flash-attn==1.0.9 and triton==2.0.0.dev20221202
359
+ try:
360
+ from flash_attn.flash_attn_triton import flash_attn_func
361
+ except:
362
+ _installed = False
363
+ if not _installed:
364
+ # installing triton-pre-mlir works for both torch1.13.1 and torch2.0+
365
+ # default recommendation is to install this variant
366
+ raise RuntimeError(
367
+ 'Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU '
368
+ +
369
+ 'and `pip install .[gpu]` if installing from llm-foundry source or '
370
+ +
371
+ '`pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` '
372
+ +
373
+ 'if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). '
374
+ +
375
+ 'Note: (1) requires you have CMake and PyTorch already installed.'
376
+ )
377
+
378
+ check_valid_inputs(query, key, value)
379
+
380
+ if multiquery:
381
+ warnings.warn(
382
+ DeprecationWarning(
383
+ 'The direct use of the multiquery arg is deprecated. Setting kv_n_heads=1 automatically. Please set kv_n_heads=1 explicitly to remove this warning.'
384
+ ))
385
+ kv_n_heads = 1
386
+ elif kv_n_heads is None:
387
+ warnings.warn(
388
+ DeprecationWarning(
389
+ 'Not specifying a value for the kv_n_heads arg is deprecated. Setting kv_n_heads=n_heads automatically. Please set kv_n_heads=n_heads explicitly to remove this warning.'
390
+ ))
391
+ kv_n_heads = n_heads
392
+
393
+ if past_key_value is not None:
394
+ if len(past_key_value) != 0:
395
+ key = torch.cat([past_key_value[0], key], dim=1)
396
+ value = torch.cat([past_key_value[1], value], dim=1)
397
+
398
+ past_key_value = (key, value)
399
+
400
+ if attn_bias is not None:
401
+ # clamp to 0 necessary for torch 2.0 compile()
402
+ _s_q = max(0, attn_bias.size(2) - query.size(1))
403
+ _s_k = max(0, attn_bias.size(3) - key.size(1))
404
+ attn_bias = attn_bias[:, :, _s_q:, _s_k:]
405
+
406
+ if dropout_p:
407
+ raise NotImplementedError(
408
+ f'Dropout not implemented for attn_impl: triton.')
409
+ dropout_p = dropout_p if training else 0.0
410
+
411
+ if needs_weights:
412
+ raise NotImplementedError(
413
+ f'attn_impl: triton cannot return attn weights.')
414
+
415
+ if key_padding_mask is not None:
416
+ warnings.warn(
417
+ 'Propagating key_padding_mask to the attention module ' +\
418
+ 'and applying it within the attention module can cause ' +\
419
+ 'unnecessary computation/memory usage. Consider integrating ' +\
420
+ 'into attn_bias once and passing that to each attention ' +\
421
+ 'module instead.'
422
+ )
423
+ b_size, s_k = key_padding_mask.shape[:2]
424
+
425
+ if attn_bias is None:
426
+ attn_bias = query.new_zeros(b_size, 1, 1, s_k)
427
+
428
+ attn_bias = attn_bias.masked_fill(
429
+ ~key_padding_mask.view((b_size, 1, 1, s_k)),
430
+ torch.finfo(query.dtype).min)
431
+
432
+ query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
433
+ key = rearrange(key, 'b s (h d) -> b s h d', h=kv_n_heads)
434
+ value = rearrange(value, 'b s (h d) -> b s h d', h=kv_n_heads)
435
+
436
+ # multi-query case
437
+ if kv_n_heads == 1:
438
+ # necessary to repeat instead of expand tensor because
439
+ # output contains NaN in edge cases such as with head dimension = 8
440
+ key = key.repeat(1, 1, n_heads, 1)
441
+ value = value.repeat(1, 1, n_heads, 1)
442
+ # grouped query case
443
+ elif kv_n_heads < n_heads:
444
+ # Each query belongs to a group of kv heads of group size n_heads // kv_n_heads
445
+ # We repeat each kv head by the group size number to use the underlying MHA kernels
446
+ key = repeat_kv_for_gqa(key, n_heads // kv_n_heads)
447
+ value = repeat_kv_for_gqa(value, n_heads // kv_n_heads)
448
+
449
+ reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
450
+ attn_output = flash_attn_func( # type: ignore
451
+ query, key, value, attn_bias, reset_is_causal, softmax_scale)
452
+
453
+ output = attn_output.view(*attn_output.shape[:2], -1) # type: ignore
454
+
455
+ return output, None, past_key_value
456
+
457
+
458
+ class GroupedQueryAttention(nn.Module):
459
+ """Grouped Query Attention (GQA).
460
+
461
+ GQA is a generalization of Multi-head Attention (MHA) and Multi-query Attention (MQA).
462
+
463
+ This allows the user to set a variable number of kv_n_heads, rather than
464
+ just n_heads or 1, as in MHA and MQA. Using the torch or triton attention
465
+ implementation enables the user to also use additive bias.
466
+ """
467
+
468
+ def __init__(
469
+ self,
470
+ d_model: int,
471
+ n_heads: int,
472
+ kv_n_heads: int,
473
+ attn_impl: str = 'triton',
474
+ clip_qkv: Optional[float] = None,
475
+ qk_ln: bool = False,
476
+ softmax_scale: Optional[float] = None,
477
+ attn_pdrop: float = 0.0,
478
+ norm_type: str = 'low_precision_layernorm',
479
+ fc_type: str = 'torch',
480
+ device: Optional[str] = None,
481
+ bias: bool = True,
482
+ ):
483
+ super().__init__()
484
+
485
+ self.attn_impl = attn_impl
486
+ self.clip_qkv = clip_qkv
487
+ self.qk_ln = qk_ln
488
+
489
+ self.d_model = d_model
490
+ self.n_heads = n_heads
491
+ self.kv_n_heads = kv_n_heads
492
+
493
+ self.head_dim = d_model // n_heads
494
+
495
+ if self.kv_n_heads <= 0:
496
+ raise ValueError('kv_n_heads should be greater than zero.')
497
+
498
+ if self.kv_n_heads > self.n_heads:
499
+ raise ValueError(
500
+ 'The number of KV heads should be less than or equal to Q heads.'
501
+ )
502
+
503
+ if self.n_heads % self.kv_n_heads != 0:
504
+ raise ValueError(
505
+ 'Each Q head should get the same number of KV heads, so n_heads must be divisible by kv_n_heads.'
506
+ )
507
+
508
+ self.softmax_scale = softmax_scale
509
+ if self.softmax_scale is None:
510
+ self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
511
+ self.attn_dropout_p = attn_pdrop
512
+
513
+ fc_kwargs: dict[str, Any] = {
514
+ 'bias': bias,
515
+ }
516
+ if fc_type != 'te':
517
+ fc_kwargs['device'] = device
518
+ self.Wqkv = FC_CLASS_REGISTRY[fc_type](
519
+ self.d_model,
520
+ self.d_model + 2 * self.kv_n_heads * self.head_dim,
521
+ **fc_kwargs,
522
+ )
523
+ # for param init fn; enables shape based init of fused layers
524
+ fuse_splits = [
525
+ i * self.head_dim
526
+ for i in range(1, self.n_heads + 2 * self.kv_n_heads)
527
+ ]
528
+ self.Wqkv._fused = (0, fuse_splits)
529
+
530
+ if self.qk_ln:
531
+ norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
532
+ self.q_ln = norm_class(self.d_model, device=device)
533
+ self.k_ln = norm_class(self.kv_n_heads * self.head_dim,
534
+ device=device)
535
+
536
+ if self.attn_impl == 'flash':
537
+ self.attn_fn = flash_attn_fn
538
+ elif self.attn_impl == 'triton':
539
+ self.attn_fn = triton_flash_attn_fn
540
+ elif self.attn_impl == 'torch':
541
+ self.attn_fn = scaled_multihead_dot_product_attention
542
+ else:
543
+ raise ValueError(f'{attn_impl=} is an invalid setting.')
544
+
545
+ self.out_proj = FC_CLASS_REGISTRY[fc_type](
546
+ self.d_model,
547
+ self.d_model,
548
+ **fc_kwargs,
549
+ )
550
+ self.out_proj._is_residual = True
551
+
552
+ def forward(
553
+ self,
554
+ x: torch.Tensor,
555
+ past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
556
+ attn_bias: Optional[torch.Tensor] = None,
557
+ attention_mask: Optional[torch.Tensor] = None,
558
+ is_causal: bool = True,
559
+ needs_weights: bool = False,
560
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[
561
+ torch.Tensor, torch.Tensor]]]:
562
+ qkv = self.Wqkv(x)
563
+
564
+ if self.clip_qkv:
565
+ qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
566
+
567
+ query, key, value = qkv.split(
568
+ [
569
+ self.d_model,
570
+ self.kv_n_heads * self.head_dim,
571
+ self.kv_n_heads * self.head_dim,
572
+ ],
573
+ dim=2,
574
+ )
575
+
576
+ key_padding_mask = attention_mask
577
+
578
+ if self.qk_ln:
579
+ # Applying layernorm to qk
580
+ dtype = query.dtype
581
+ query = self.q_ln(query).to(dtype)
582
+ key = self.k_ln(key).to(dtype)
583
+
584
+ context, attn_weights, past_key_value = self.attn_fn(
585
+ query,
586
+ key,
587
+ value,
588
+ self.n_heads,
589
+ self.kv_n_heads,
590
+ past_key_value=past_key_value,
591
+ softmax_scale=self.softmax_scale,
592
+ attn_bias=attn_bias,
593
+ key_padding_mask=key_padding_mask,
594
+ is_causal=is_causal,
595
+ dropout_p=self.attn_dropout_p,
596
+ training=self.training,
597
+ needs_weights=needs_weights,
598
+ )
599
+
600
+ return self.out_proj(context), attn_weights, past_key_value
601
+
602
+
603
+ class MultiheadAttention(GroupedQueryAttention):
604
+ """Multi-head self attention.
605
+
606
+ Using the torch or triton attention implementation enables the user to also use
607
+ additive bias.
608
+ """
609
+
610
+ def __init__(
611
+ self,
612
+ d_model: int,
613
+ n_heads: int,
614
+ attn_impl: str = 'triton',
615
+ clip_qkv: Optional[float] = None,
616
+ qk_ln: bool = False,
617
+ softmax_scale: Optional[float] = None,
618
+ attn_pdrop: float = 0.0,
619
+ norm_type: str = 'low_precision_layernorm',
620
+ fc_type: str = 'torch',
621
+ device: Optional[str] = None,
622
+ bias: bool = True,
623
+ ):
624
+ super().__init__(
625
+ d_model=d_model,
626
+ n_heads=n_heads,
627
+ kv_n_heads=n_heads, # for MHA, same # heads as kv groups
628
+ attn_impl=attn_impl,
629
+ clip_qkv=clip_qkv,
630
+ qk_ln=qk_ln,
631
+ softmax_scale=softmax_scale,
632
+ attn_pdrop=attn_pdrop,
633
+ norm_type=norm_type,
634
+ fc_type=fc_type,
635
+ device=device,
636
+ bias=bias,
637
+ )
638
+
639
+
640
+ class MultiQueryAttention(GroupedQueryAttention):
641
+ """Multi-Query self attention.
642
+
643
+ Using the torch or triton attention implementation enables the user to also use
644
+ additive bias.
645
+ """
646
+
647
+ def __init__(
648
+ self,
649
+ d_model: int,
650
+ n_heads: int,
651
+ attn_impl: str = 'triton',
652
+ clip_qkv: Optional[float] = None,
653
+ qk_ln: bool = False,
654
+ softmax_scale: Optional[float] = None,
655
+ attn_pdrop: float = 0.0,
656
+ norm_type: str = 'low_precision_layernorm',
657
+ fc_type: str = 'torch',
658
+ device: Optional[str] = None,
659
+ bias: bool = True,
660
+ ):
661
+ super().__init__(
662
+ d_model=d_model,
663
+ n_heads=n_heads,
664
+ kv_n_heads=1, # for MQA, 1 head
665
+ attn_impl=attn_impl,
666
+ clip_qkv=clip_qkv,
667
+ qk_ln=qk_ln,
668
+ softmax_scale=softmax_scale,
669
+ attn_pdrop=attn_pdrop,
670
+ norm_type=norm_type,
671
+ fc_type=fc_type,
672
+ device=device,
673
+ bias=bias,
674
+ )
675
+
676
+
677
+ def attn_bias_shape(
678
+ attn_impl: str, n_heads: int, seq_len: int, alibi: bool,
679
+ prefix_lm: bool, causal: bool,
680
+ use_sequence_id: bool) -> Optional[Tuple[int, int, int, int]]:
681
+ if attn_impl == 'flash':
682
+ return None
683
+ elif attn_impl in ['torch', 'triton']:
684
+ if alibi:
685
+ if (prefix_lm or not causal) or use_sequence_id:
686
+ return (1, n_heads, seq_len, seq_len)
687
+ return (1, n_heads, 1, seq_len)
688
+ elif prefix_lm or use_sequence_id:
689
+ return (1, 1, seq_len, seq_len)
690
+ return None
691
+ else:
692
+ raise ValueError(f'{attn_impl=} is an invalid setting.')
693
+
694
+
695
+ def build_attn_bias(
696
+ attn_impl: str,
697
+ attn_bias: torch.Tensor,
698
+ n_heads: int,
699
+ seq_len: int,
700
+ causal: bool = False,
701
+ alibi: bool = False,
702
+ alibi_bias_max: int = 8,
703
+ ) -> Optional[torch.Tensor]:
704
+ if attn_impl == 'flash':
705
+ return None
706
+ elif attn_impl in ['torch', 'triton']:
707
+ if alibi:
708
+ # in place add alibi to attn bias
709
+ device, dtype = attn_bias.device, attn_bias.dtype
710
+ attn_bias = attn_bias.add(
711
+ build_alibi_bias(
712
+ n_heads,
713
+ seq_len,
714
+ full=not causal,
715
+ alibi_bias_max=alibi_bias_max,
716
+ device=device,
717
+ dtype=dtype,
718
+ ))
719
+ return attn_bias
720
+ else:
721
+ raise ValueError(f'{attn_impl=} is an invalid setting.')
722
+
723
+
724
+ def gen_slopes(n_heads: int,
725
+ alibi_bias_max: int = 8,
726
+ device: Optional[torch.device] = None) -> torch.Tensor:
727
+ _n_heads = 2**math.ceil(math.log2(n_heads))
728
+ m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
729
+ m = m.mul(alibi_bias_max / _n_heads)
730
+ slopes = (1. / torch.pow(2, m))
731
+
732
+ if _n_heads != n_heads:
733
+ # if n_heads is not a power of two,
734
+ # Huggingface and FasterTransformer calculate slopes normally,
735
+ # then return this strided concatenation of slopes
736
+ slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
737
+
738
+ return slopes.view(1, n_heads, 1, 1)
739
+
740
+
741
+ def build_alibi_bias(
742
+ n_heads: int,
743
+ seq_len: int,
744
+ full: bool = False,
745
+ alibi_bias_max: int = 8,
746
+ device: Optional[torch.device] = None,
747
+ dtype: Optional[torch.dtype] = None,
748
+ ) -> torch.Tensor:
749
+ alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32,
750
+ device=device).view(1, 1, 1, seq_len)
751
+ if full:
752
+ # generate 1 x Heads x SeqLen x SeqLen alibi bias mask
753
+ # otherwise the mask is 1 x Heads x 1 x SeqLen (which is broadcast to the appropriate size)
754
+ alibi_bias = alibi_bias - torch.arange(
755
+ 1 - seq_len, 1, dtype=torch.int32, device=device).view(
756
+ 1, 1, seq_len, 1)
757
+ alibi_bias = alibi_bias.abs().mul(-1)
758
+
759
+ slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
760
+ alibi_bias = alibi_bias * slopes
761
+ return alibi_bias.to(dtype=dtype)
762
+
763
+
764
+ ATTN_CLASS_REGISTRY = {
765
+ 'multihead_attention': MultiheadAttention,
766
+ 'multiquery_attention': MultiQueryAttention,
767
+ 'grouped_query_attention': GroupedQueryAttention
768
+ }
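To tie the last few helpers together, here is a minimal sketch (assuming the functions defined in this file are importable) that builds a causal ALiBi bias for the `torch` attention implementation:

    import torch

    n_heads, seq_len = 4, 8
    shape = attn_bias_shape('torch', n_heads, seq_len, alibi=True,
                            prefix_lm=False, causal=True, use_sequence_id=False)
    attn_bias = torch.zeros(shape)                       # (1, n_heads, 1, seq_len) for causal ALiBi
    attn_bias = build_attn_bias('torch', attn_bias, n_heads, seq_len,
                                causal=True, alibi=True)  # adds per-head linear-slope biases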
Perceptrix/finetune/build/lib/llmfoundry/models/layers/blocks.py ADDED
@@ -0,0 +1,117 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """GPT Blocks used for the GPT Model."""
5
+
6
+ from typing import Any, Dict, Optional, Tuple
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from llmfoundry.models.layers.attention import ATTN_CLASS_REGISTRY
12
+ from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, build_ffn
13
+ from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
14
+
15
+
16
+ class MPTBlock(nn.Module):
17
+
18
+ def __init__(
19
+ self,
20
+ d_model: int,
21
+ n_heads: int,
22
+ expansion_ratio: int,
23
+ attn_config: Optional[Dict] = None,
24
+ ffn_config: Optional[Dict] = None,
25
+ resid_pdrop: float = 0.0,
26
+ norm_type: str = 'low_precision_layernorm',
27
+ fc_type: str = 'torch',
28
+ device: Optional[str] = None,
29
+ no_bias: bool = False,
30
+ **kwargs: Any,
31
+ ):
32
+ if attn_config is None:
33
+ attn_config = {
34
+ 'attn_type': 'multihead_attention',
35
+ 'attn_pdrop': 0.0,
36
+ 'attn_impl': 'triton',
37
+ 'qk_ln': False,
38
+ 'clip_qkv': None,
39
+ 'softmax_scale': None,
40
+ 'prefix_lm': False,
41
+ 'attn_uses_sequence_id': False,
42
+ 'alibi': False,
43
+ 'alibi_bias_max': 8,
44
+ }
45
+
46
+ if ffn_config is None:
47
+ ffn_config = {
48
+ 'ffn_type': 'mptmlp',
49
+ }
50
+
51
+ del kwargs # unused, just to capture any extra args from the config
52
+ super().__init__()
53
+
54
+ norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
55
+ assert isinstance(attn_config['attn_type'], str)
56
+ attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
57
+
58
+ # necessary to avoid passing extraneous args into attn_class while allowing the use of **kwargs
59
+ args_to_exclude_in_attn_class = {
60
+ 'attn_type', 'prefix_lm', 'alibi', 'attn_uses_sequence_id',
61
+ 'alibi_bias_max'
62
+ }
63
+ attn_config_subset_for_attn_class = {
64
+ k: v
65
+ for k, v in attn_config.items()
66
+ if k not in args_to_exclude_in_attn_class
67
+ }
68
+
69
+ self.norm_1 = norm_class(d_model, device=device)
70
+ self.attn = attn_class(
71
+ d_model=d_model,
72
+ n_heads=n_heads,
73
+ fc_type=fc_type,
74
+ device=device,
75
+ **attn_config_subset_for_attn_class,
76
+ bias=not no_bias,
77
+ )
78
+ self.norm_2 = None
79
+ if not getattr(FFN_CLASS_REGISTRY[ffn_config['ffn_type']], '_has_norm',
80
+ False):
81
+ self.norm_2 = norm_class(d_model, device=device)
82
+ self.ffn = build_ffn(
83
+ d_model=d_model,
84
+ expansion_ratio=expansion_ratio,
85
+ device=device,
86
+ bias=not no_bias,
87
+ **ffn_config,
88
+ )
89
+ self.resid_attn_dropout = nn.Dropout(resid_pdrop)
90
+ self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
91
+
92
+ def forward(
93
+ self,
94
+ x: torch.Tensor,
95
+ past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
96
+ attn_bias: Optional[torch.Tensor] = None,
97
+ attention_mask: Optional[torch.ByteTensor] = None,
98
+ is_causal: bool = True,
99
+ output_attentions: bool = False,
100
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[
101
+ torch.Tensor, torch.Tensor]]]:
102
+ a = self.norm_1(x)
103
+ b, attn_weights, past_key_value = self.attn(
104
+ a,
105
+ past_key_value=past_key_value,
106
+ attn_bias=attn_bias,
107
+ attention_mask=attention_mask,
108
+ is_causal=is_causal,
109
+ needs_weights=output_attentions,
110
+ )
111
+ x = x + self.resid_attn_dropout(b)
112
+ m = x
113
+ if self.norm_2 is not None:
114
+ m = self.norm_2(x)
115
+ n = self.ffn(m)
116
+ x = x + self.resid_ffn_dropout(n)
117
+ return x, attn_weights, past_key_value
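As a rough smoke test of the block above (a sketch, not part of the repository), the pure-torch attention path can run on CPU; the `triton` and `flash` implementations require their respective kernels:

    import torch

    block = MPTBlock(d_model=64, n_heads=4, expansion_ratio=4,
                     attn_config={'attn_type': 'multihead_attention', 'attn_impl': 'torch'})
    x = torch.randn(2, 8, 64)              # (batch, seq, d_model)
    out, attn_weights, past_kv = block(x)  # attn_weights is None unless output_attentions=True
    assert out.shape == x.shape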
Perceptrix/finetune/build/lib/llmfoundry/models/layers/custom_embedding.py ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright 2022 MosaicML LLM Foundry authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from torch import Tensor
7
+
8
+
9
+ class SharedEmbedding(nn.Embedding):
10
+
11
+ def forward(self, input: Tensor, unembed: bool = False) -> Tensor:
12
+ if unembed:
13
+ return F.linear(input, self.weight)
14
+ return super().forward(input)
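A short illustration of the weight tying this class enables (a sketch; shapes are arbitrary): the same matrix embeds token ids and, with `unembed=True`, projects hidden states back to vocabulary logits.

    import torch

    emb = SharedEmbedding(num_embeddings=100, embedding_dim=16)
    hidden = emb(torch.tensor([[1, 2, 3]]))   # (1, 3, 16) token embeddings
    logits = emb(hidden, unembed=True)        # (1, 3, 100) logits via the tied weight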