Spaces:
Running
Running
File size: 8,914 Bytes
e7cb6de 2c8408f 67543ef 2c8408f 6334a1e 5de0055 e7cb6de 8d72163 eb84d7e 40a40cb 1fa958e 74c0a8b 9501bef 31e3c37 83f2644 9501bef eb84d7e 9501bef 5de0055 2a1c060 9501bef eb84d7e 2dd0559 51c0840 67543ef 51c0840 b8e290e afb37e6 9501bef 1fa958e eb84d7e 1fa958e de45fdc 1fa958e 207ec3b 1fa958e 1b3d905 1fa958e eb84d7e 1fa958e de45fdc 1fa958e 40a40cb afb37e6 3244268 2c8408f bb88228 d9f26aa 266c7a6 d9f26aa 262dfde d9f26aa bb88228 2a1c060 3265b22 bb88228 3265b22 9b2e5ac bb88228 3265b22 d9f26aa 5622e1c 3265b22 2c8408f 5622e1c 2c8408f bb88228 2c8408f 2a1c060 c85f4b8 b88152c 5933b14 e304b98 eb84d7e e304b98 1c92e81 e8e9bc6 e304b98 e80eb77 1115dfa d9f26aa a8ce2d7 384ad14 1c92e81 e304b98 67543ef 266c7a6 b88152c 67543ef 1c92e81 b88152c 60ca977 6a12a73 02271b4 6a12a73 02271b4 c38e426 6a12a73 8fc9c43 6a12a73 afb37e6 8fc9c43 afb37e6 bb88228 37c61d6 bb88228 3244268 bb88228 b88152c 2c8408f 731bcbf 2dd0559 2c8408f 67543ef d9f26aa 67543ef 2dd0559 d9f26aa b88152c 2dd0559 b88152c e644d1d b88152c 2c8408f a1678bd 93f7595 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
import gradio as gr
from langchain.text_splitter import (
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
Language,
)
from transformers import AutoTokenizer
from overlap import unoverlap_list
# Labels of the two splitting methods. The same strings are used as dropdown
# choices and compared against the selected value to pick the splitter.
LABEL_TEXTSPLITTER = "🦜🔗 LangChain's CharacterTextSplitter"
LABEL_RECURSIVE = "🦜🔗 LangChain's RecursiveCharacterTextSplitter"
# Tokenizer loaded once at import time; used to measure lengths in BERT tokens.
bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
def length_tokens(txt):
    """Return how many tokens the module-level BERT tokenizer produces for ``txt``.

    Used as the ``length_function`` of the text splitters when chunk sizes
    are measured in tokens rather than characters.
    """
    tokens = bert_tokenizer.tokenize(txt)
    return len(tokens)
def extract_separators_from_string(separators_str):
    """Parse the separators textbox content into a list of separator strings.

    The expected input looks like a Python list literal, for example
    ``['separator_1', 'separator_2']``. Escape sequences typed as two
    characters (backslash-n, backslash-t, double backslash) are converted to
    the real newline, tab and backslash characters; the first and last
    characters (the brackets) are dropped; items are split on ", " and every
    single or double quote is removed from each item.

    Args:
        separators_str: Raw text taken from the separators textbox.

    Returns:
        The list of separators, in the order they were typed.

    Raises:
        gr.Error: If the input cannot be processed (e.g. it is not a string).
    """
    try:
        # Turn typed escape sequences into the characters they stand for.
        unescaped = (
            separators_str.replace("\\n", "\n")
            .replace("\\t", "\t")
            .replace("\\\\", "\\")
        )
        # Strip the surrounding brackets, then split the individual items.
        separators = unescaped[1:-1].split(", ")
        return [separator.replace('"', "").replace("'", "") for separator in separators]
    except Exception as e:
        # The original message interpolated an undefined name (`separator_str`),
        # which turned every parsing failure into a NameError.
        raise gr.Error(f"""
        Did not succeed in extracting seperators from string: {separators_str} due to: {str(e)}.
        Please type it in the correct format: "['separator_1', 'separator_2', ...]"
        """) from e
def change_split_selection(split_selection):
    """Show or hide the recursive-splitter controls for the chosen method.

    Args:
        split_selection: Label currently selected in the method dropdown.

    Returns:
        A pair of component updates (separators textbox, preset radio) whose
        ``visible`` flag is True only when the recursive splitter is selected.
    """
    show_recursive_options = split_selection == LABEL_RECURSIVE
    # `gr.update` is used instead of the per-component `.update()` classmethods,
    # which are no longer available in the Gradio versions that provide
    # `gr.on` and the theme used by this app.
    return (
        gr.update(visible=show_recursive_options),
        gr.update(visible=show_recursive_options),
    )
def chunk(text, length, splitter_selection, separators_str, length_unit_selection, chunk_overlap):
    """Split ``text`` into chunks and label each segment for highlighting.

    Args:
        text: The text to split.
        length: Maximum chunk length, in the selected unit.
        splitter_selection: Label of the splitting method to use
            (``LABEL_TEXTSPLITTER`` or ``LABEL_RECURSIVE``).
        separators_str: Textbox content describing the separators; only
            parsed when the recursive splitter is selected.
        length_unit_selection: Label of the length unit; any label containing
            "token" (case-insensitive) measures length in BERT tokens,
            otherwise in characters.
        chunk_overlap: Overlap between consecutive chunks, in the same unit.

    Returns:
        A list of ``(text_segment, label)`` tuples where the label is
        ``'Overlap'`` for segments flagged as overlap and ``'Chunk <i>'``
        otherwise, ``i`` being the segment's position in the list.

    Raises:
        gr.Error: If the splitter label is not recognized, or if the
            separators cannot be parsed.
    """
    length_function = length_tokens if "token" in length_unit_selection.lower() else len
    if splitter_selection == LABEL_TEXTSPLITTER:
        text_splitter = CharacterTextSplitter(
            chunk_size=int(length),
            chunk_overlap=int(chunk_overlap),
            length_function=length_function,
            strip_whitespace=False,
            is_separator_regex=False,
            separator=" ",
        )
    elif splitter_selection == LABEL_RECURSIVE:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(length),
            chunk_overlap=int(chunk_overlap),
            length_function=length_function,
            strip_whitespace=False,
            separators=extract_separators_from_string(separators_str),
        )
    else:
        # Previously this fell through and crashed on an unbound variable.
        raise gr.Error("Choice of splitter not recognized.")
    splits = text_splitter.create_documents([text])
    text_splits = [split.page_content for split in splits]
    # Each item is read as (segment_text, overlap_flag) below; presumably that
    # is what `unoverlap_list` returns — confirm against overlap.py.
    unoverlapped_text_splits = unoverlap_list(text_splits)
    return [
        (split[0], "Overlap") if split[1] else (split[0], f"Chunk {i}")
        for i, split in enumerate(unoverlapped_text_splits)
    ]
def change_preset_separators(choice):
    """Return the list of separators that corresponds to a preset name.

    Args:
        choice: Preset name; one of "Default", "Markdown" or "Python".

    Returns:
        The separator list for the preset. "Default" yields a fixed list;
        the other presets are looked up through LangChain's
        ``get_separators_for_language``.

    Raises:
        gr.Error: If ``choice`` is not a known preset.
    """
    splitter = RecursiveCharacterTextSplitter()
    if choice == "Default":
        return ["\n\n", "\n", " ", ""]
    language_presets = (
        ("Markdown", Language.MARKDOWN),
        ("Python", Language.PYTHON),
    )
    for preset_name, language in language_presets:
        if choice == preset_name:
            return splitter.get_separators_for_language(language)
    raise gr.Error("Choice of preset not recognized.")
# Sample text pre-filled in the input textbox; it mixes prose, Markdown
# headings/lists and a fenced Python snippet.
EXAMPLE_TEXT = """### Chapter 6
WHAT SORT OF DESPOTISM DEMOCRATIC NATIONS HAVE TO FEAR
I had remarked during my stay in the United States that a democratic state of society, similar to that of the Americans, might offer singular facilities for the establishment of despotism; and I perceived, upon my return to Europe, how much use had already been made, by most of our rulers, of the notions, the sentiments, and the wants created by this same social condition, for the purpose of extending the circle of their power. This led me to think that the nations of Christendom would perhaps eventually undergo some oppression like that which hung over several of the nations of the ancient world.
A more accurate examination of the subject, and five years of further meditation, have not diminished my fears, but have changed their object.
No sovereign ever lived in former ages so absolute or so powerful as to undertake to administer by his own agency, and without the assistance of intermediate powers, all the parts of a great empire; none ever attempted to subject all his subjects indiscriminately to strict uniformity of regulation and personally to tutor and direct every member of the community. The notion of such an undertaking never occurred to the human mind; and if any man had conceived it, the want of information, the imperfection of the administrative system, and, above all, the natural obstacles caused by the inequality of conditions would speedily have checked the execution of so vast a design.
---
### Challenges of agent systems
Generally, the difficult parts of running an agent system for the LLM engine are:
1. From supplied tools, choose the one that will help advance to a desired goal: e.g. when asked `"What is the smallest prime number greater than 30,000?"`, the agent could call the `Search` tool with `"What is he height of K2"` but it won't help.
2. Call tools with a rigorous argument formatting: for instance when trying to calculate the speed of a car that went 3 km in 10 minutes, you have to call tool `Calculator` to divide `distance` by `time` : even if your Calculator tool accepts calls in the JSON format: `{”tool”: “Calculator”, “args”: “3km/10min”}` , there are many pitfalls, for instance:
- Misspelling the tool name: `“calculator”` or `“Compute”` wouldn’t work
- Giving the name of the arguments instead of their values: `“args”: “distance/time”`
- Non-standardized formatting: `“args": "3km in 10minutes”`
3. Efficiently ingesting and using the information gathered in the past observations, be it the initial context or the observations returned after using tool uses.
So, how would a complete Agent setup look like?
## Running agents with LangChain
We have just integrated a `ChatHuggingFace` wrapper that lets you create agents based on open-source models in [🦜🔗LangChain](https://www.langchain.com/).
The code to create the ChatModel and give it tools is really simple, you can check it all in the [Langchain doc](https://python.langchain.com/docs/integrations/chat/huggingface).
```python
from langchain_community.llms import HuggingFaceHub
from langchain_community.chat_models.huggingface import ChatHuggingFace
llm = HuggingFaceHub(
repo_id="HuggingFaceH4/zephyr-7b-beta",
task="text-generation",
)
chat_model = ChatHuggingFace(llm=llm)
```
"""
# Build the UI: input text, splitter options, length options, highlighted
# output, and the event wiring that recomputes the chunks.
with gr.Blocks(theme=gr.themes.Citrus(text_size='md', font=["monospace"], primary_hue=gr.themes.colors.green)) as demo:
    text = gr.Textbox(label="Your text 🪶", value=EXAMPLE_TEXT)
    # Row 1: splitting method and the options specific to the recursive splitter.
    with gr.Row():
        split_selection = gr.Dropdown(
            choices=[
                LABEL_TEXTSPLITTER,
                LABEL_RECURSIVE,
            ],
            value=LABEL_RECURSIVE,
            label="Method to split chunks 🍞",
        )
        # NOTE(review): `value` is a list, not a string — presumably Gradio
        # stringifies it for display, which is the format that
        # extract_separators_from_string parses back; confirm.
        separators_selection = gr.Textbox(
            elem_id="textbox_id",
            value=["\n\n", "\n", " ", ""],
            info="Separators used in RecursiveCharacterTextSplitter",
            show_label=False, # or set label to an empty string if you want to keep its space
            visible=True,
        )
        separator_preset_selection = gr.Radio(
            ['Default', 'Python', 'Markdown'],
            label="Choose a preset",
            info="This will apply a specific set of separators to RecursiveCharacterTextSplitter.",
            visible=True,
        )
    # Row 2: length unit, chunk length and overlap.
    with gr.Row():
        length_unit_selection = gr.Dropdown(
            choices=[
                "Character count",
                "Token count (BERT tokens)",
            ],
            value="Character count",
            label="Length function",
            info="How should we measure our chunk lengths?",
        )
        slider_count = gr.Slider(
            50, 500, value=200, step=1, label="Chunk length 📏", info="In the chosen unit."
        )
        chunk_overlap = gr.Slider(
            0, 50, value=10, step=1, label="Overlap between chunks", info="In the chosen unit."
        )
    # Output: segments labelled 'Overlap' get a fixed grey colour.
    out = gr.HighlightedText(
        label="Output",
        show_legend=True,
        show_label=False,
        color_map={'Overlap': '#DADADA'}
    )
    # Toggle visibility of the recursive-splitter controls when the method changes.
    split_selection.change(
        fn=change_split_selection,
        inputs=split_selection,
        outputs=[separators_selection, separator_preset_selection],
    )
    # Choosing a preset overwrites the separators textbox.
    separator_preset_selection.change(
        fn=change_preset_separators,
        inputs=separator_preset_selection,
        outputs=separators_selection,
    )
    # Recompute the chunks whenever any input that affects them changes.
    gr.on(
        [text.change, length_unit_selection.change, separators_selection.change, split_selection.change, slider_count.change, chunk_overlap.change],
        chunk,
        [text, slider_count, split_selection, separators_selection, length_unit_selection, chunk_overlap],
        outputs=out
    )
    # Compute the initial output when the page loads.
    demo.load(chunk, inputs=[text, slider_count, split_selection, separators_selection, length_unit_selection, chunk_overlap], outputs=out)
# Start the Gradio server for the app defined above.
demo.launch()