Commit 7a94c8b
1 Parent(s): 8b05b02
commit

- __pycache__/main.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- main.py +126 -171
- static/assets/{index-66194b32.js → index-7974ca0c.js} +0 -0
- static/index.html +1 -1
__pycache__/main.cpython-310.pyc
ADDED
Binary file (12 kB).

__pycache__/utils.cpython-310.pyc
ADDED
Binary file (1.16 kB).
main.py
CHANGED
@@ -16,15 +16,15 @@ from pprint import pprint
 import asyncio
 import importlib.util
 import traceback
-
 import sys
 import json
 import jsonschema
-# import aiosqlite
 from utils import extract_code
 import numpy as np
 import os
 import requests
+import secrets
+import urllib.parse

 app = FastAPI()

@@ -32,54 +32,42 @@ client_id = os.getenv("OAUTH_CLIENT_ID")
 client_secret = os.getenv("OAUTH_CLIENT_SECRET")
 space_host = os.getenv("SPACE_HOST")

-# DATABASE_FILE = "samples.db"
-
-
 client = OpenAI(
-
-
+    base_url="https://openrouter.ai/api/v1",
+    api_key=os.environ.get('OPENROUTER_KEY')
 )

-
-# async def setup_database():
-#     async with aiosqlite.connect(DATABASE_FILE) as db:
-#         await db.execute("""
-#             CREATE TABLE IF NOT EXISTS samples (
-#                 hash TEXT PRIMARY KEY,
-#                 data TEXT NOT NULL,
-#                 dataset TEXT NOT NULL
-#             )
-#         """)
-#         await db.commit()
-
-# async def insert_sample(hash: str, data: str, dataset: str):
-#     async with aiosqlite.connect(DATABASE_FILE) as db:
-#         # Check if a record with the same hash already exists
-#         cursor = await db.execute("SELECT COUNT(*) FROM samples WHERE hash = ?", (hash,))
-#         count = await cursor.fetchone()
-
-#         if count[0] == 0:
-#             # Insert the new record since it doesn't exist
-#             await db.execute("INSERT INTO samples (hash, data, dataset) VALUES (?, ?, ?)", (hash, data, dataset))
-#             await db.commit()
-#         else:
-#             # A record with the same hash already exists
-#             print("Record with the same hash already exists in the database.")
-
-# async def get_sample_by_hash(hash: str):
-#     async with aiosqlite.connect(DATABASE_FILE) as db:
-#         cursor = await db.execute("SELECT data, dataset FROM samples WHERE hash = ?", (hash,))
-#         row = await cursor.fetchone()
-#         return row
+state_queue_map = {}

 def is_sharegpt(sample):
-    schema=
+    schema = {
+        '$schema': 'http://json-schema.org/schema#',
+        'type': 'object',
+        'properties': {
+            'conversations': {
+                'type': 'array',
+                'items': {
+                    'type': 'object',
+                    'properties': {
+                        'from': {
+                            'type': 'string',
+                            'enum': [
+                                'human',
+                                'gpt',
+                                'system']},
+                        'value': {
+                            'type': 'string'}},
+                    'required': [
+                        'from',
+                        'value']}}},
+        'required': ['conversations']}
     try:
         jsonschema.validate(instance=sample, schema=schema)
         return True
     except jsonschema.exceptions.ValidationError as e:
         return False

+
 def sha256(string):
     # Create a hashlib object for SHA-256
     sha256_hash = hashlib.sha256()
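As a point of reference (not part of the diff): the inlined schema above means `is_sharegpt` only accepts samples that already carry a `conversations` list whose turns use a `from` value of `human`, `gpt`, or `system` and a string `value`. A minimal hypothetical illustration:

# Hypothetical samples, for illustration only.
ok_sample = {
    "conversations": [
        {"from": "human", "value": "Hello"},
        {"from": "gpt", "value": "Hi there!"},
    ]
}
alpaca_like = {"instruction": "Say hello", "output": "Hello"}

# is_sharegpt(ok_sample)   -> True  (matches the conversations schema)
# is_sharegpt(alpaca_like) -> False (no 'conversations' key, so validation fails)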
@@ -88,6 +76,7 @@ def sha256(string):

     return sha256_hash.hexdigest()

+
 def get_adapter_name(sample):
     builder = SchemaBuilder()
     builder.add_object(sample)
@@ -95,6 +84,7 @@ def get_adapter_name(sample):

     return sha256(json.dumps(schema))

+
 def has_adapter(sample):
     adapter_name = get_adapter_name(sample)

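Context for the unchanged helper above: `get_adapter_name` builds a JSON schema for the sample via `SchemaBuilder` and hashes it, so adapters are keyed by the sample's field layout rather than its values. A hypothetical illustration (the field names are made up):

# Two samples with the same keys and value types should infer the same schema,
# so they resolve to the same adapter module name (the schema hash).
a = {"instruction": "Translate to French", "output": "Bonjour"}
b = {"instruction": "Summarize this text", "output": "A short summary"}
# get_adapter_name(a) == get_adapter_name(b)  -> True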
@@ -106,37 +96,26 @@ def has_adapter(sample):

     return True

+
 def auto_tranform(sample):
     adapter_name = get_adapter_name(sample)
     if not has_adapter(sample):
         create_adapter(sample, adapter_name)

     module_name = f"dataset_adapters.{adapter_name}"
-    spec = importlib.util.spec_from_file_location(
+    spec = importlib.util.spec_from_file_location(
+        module_name, f"dataset_adapters/{adapter_name}.py")
     dynamic_module = importlib.util.module_from_spec(spec)
     sys.modules[module_name] = dynamic_module
     spec.loader.exec_module(dynamic_module)

-    # Use the function from the dynamically imported module
     transformed_data = dynamic_module.transform_data(sample)

     if isinstance(transformed_data, list):
-        return {'conversations'
-
+        return {'conversations': transformed_data}

     return transformed_data

-
-
-
-# def create_adapter(sample, adapter_name):
-#     builder = SchemaBuilder()
-#     builder.add_object(sample)
-#     schema = builder.to_schema()
-
-#     code_string = """def transform_data(data):
-#     raise Exception('')"""
-
     with open(f"dataset_adapters/{adapter_name}.py", 'w') as file:
         file.write(code_string)

@@ -146,7 +125,7 @@ def create_adapter(sample, adapter_name):
     builder.add_object(sample)
     schema = builder.to_schema()

-    prompt = f"""Make me minimal and efficient python code to convert data in the shape of
+    prompt = f"""Make me minimal and efficient python code to convert data in the shape of

 initial data shape
 ==========➡️📑📐==========
@@ -177,65 +156,26 @@ For transforming the data you shall use python. Make robust and elegant python c

 your code will contain a function `def transform_data(data):` that does the transformation and outputs the newly shaped data. Only the data, no schema. Your code snippet will include only the function signature and body. I know how to call it. You won't need to import anything, I will take care of parsing and dumping json. You work with dicts. Remember to be careful if you iterate over the data because I want the output conversation to always start with the prompt. In other words, always process "input" before "output" and "instruction" before "output". Such heuristics are very important. If there is "instruction" and "input" and the "input" is not empty, concat the input at the end of the first message. If the data contains no "system" message, human always speaks first. If it contains a "system" message, the "system" message is first, then human, then gpt, then alternating if needed

-"human" ALWAYS SPEAKS BEFORE "gpt", if you suspect your code makes "gpt speak first, fix it
+"human" ALWAYS SPEAKS BEFORE "gpt", if you suspect your code makes "gpt speak first, fix it

 MOST IMPORTANT IS THAT YOU look at the initial data shape (➡️📑📐) to ground your transformation into final data shape (⬇️📑📐)

 Your output should contain only the code for `def transform_data(data):`, signature and body. Put the code inside markdown code block"""

     response = client.chat.completions.create(
-
-
-
+        model="openai/gpt-4-1106-preview",
+        messages=[
+            {"role": "system", "content": """You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.
 Knowledge cutoff: 2023-04
 Current date: 2023-11-05

-Image input capabilities: Enabled"""
-
-
-
-# {json.dumps(schema)}
-# ```
-
-# to equivalent data in the form ```
-# {{'$schema': 'http://json-schema.org/schema#', 'type': 'object', 'properties': {{'conversations': {{'type': 'array', 'items': {{'type': 'object', 'properties': {{'from': {{ 'type': 'string', 'enum': ['human', 'gpt', 'system'] }}, 'value': {{'type': 'string'}}}}, 'required': ['from', 'value']}}}}}}, 'required': ['conversations']}}
-# ```
-
-# the input is
-# ```
-# {json.dumps(sample)}
-# ```
-
-
-# `input` is usually associated with `"from" : "human"` while `output` is usually associated with `"from" : "gpt"`
-
-# don't transform, make robust and elegant python code that will do the transformation
-
-
-# your code will contain a function `def transform_data(data):` that does the transformation and outputs the newly shaped data. Only the data, no schema. Your code snippet will include only the function signature and body. I know how to call it. You won't need to import anything, I will take care of parsing and dumping json. You work with dicts. Remember to be careful if you iterate over the data because I want the output conversation to always start with the prompt. In other words, always process "input" before "output" and "instruction" before "output". Such heuristics are very important. If there is "instruction" and "input" and the "input" is not empty, concat the input at the end of the first message."""
-# }
-            {"role": "user", "content": prompt}
-        ]
-    )
+Image input capabilities: Enabled"""},
+            {"role": "user", "content": prompt}
+        ]
+    )

     val = response.choices[0].message.content
-    # index = val.index('def transform_data(data)')
-
-    # def get_code_start():
-    #     for i in range(index,0,-1):
-    #         if val[i:i+3] == "```":
-    #             idx = val[i:].index('\n')
-    #             return i + (idx) + 1
-
-    # def get_code_end():
-    #     for i in range(index, len(val)):
-    #         if val[i:i+3] == "```":
-    #             return i-1

-    # code_string = val[get_code_start():get_code_end()]
-
-
-    # print("###", val)
     code_string = extract_code(val)

     if code_string is None:
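To make the contract described in the prompt above concrete, here is a hypothetical example of the kind of adapter the model is asked to emit for an Alpaca-style sample with `instruction`/`input`/`output` fields (the field names are an assumption for illustration, not something this commit fixes). `create_adapter` writes the extracted code to `dataset_adapters/<schema-hash>.py`, and `auto_tranform` imports and calls it.

# Hypothetical generated adapter, following the prompt's heuristics: process
# "instruction"/"input" before "output", append a non-empty "input" to the
# first message, and keep "human" speaking before "gpt".
def transform_data(data):
    human_text = data.get("instruction", "")
    if data.get("input"):
        human_text = f"{human_text}\n{data['input']}"
    return {
        "conversations": [
            {"from": "human", "value": human_text},
            {"from": "gpt", "value": data.get("output", "")},
        ]
    }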
@@ -251,28 +191,59 @@ async def get_sample(hash: str = Query(..., alias="hash")):
     if res is None:
         raise HTTPException(status_code=404, detail="Item not found")
     data, dataset = res
-    sample= auto_tranform(json.loads(data))
+    sample = auto_tranform(json.loads(data))
     return {'sample': sample, 'dataset': dataset}

-
-
+
+def generate_random_string(length=16):
+    return secrets.token_hex(length)
+
+
+
+@app.get("/oauth_token")
+async def get_oauth_token():
     queue = asyncio.Queue()
-
-
+
+    async def event_stream(queue, state):
+        state_queue_map[state] = queue
+
+        redirect_uri = f'https://{space_host}/login/callback'
+
+        auth_url = (
+            f"https://huggingface.co/oauth/authorize?"
+            f"redirect_uri={urllib.parse.quote(redirect_uri)}&"
+            f"client_id={client_id}&"
+            f"scope=openid%20profile&"
+            f"response_type=code&"
+            f"state={state}"
+        )
+        yield f"data: {json.dumps({ 'url' : auth_url })}\n\n"
+
         try:
+            while True:
+                message = await queue.get()
+                if 'end_stream' in message and message['end_stream']:
+                    break
+                yield f"data: {json.dumps(message)}\n\n"
+        finally:
+            del state_queue_map[state]

+    state = generate_random_string()
+    return StreamingResponse(
+        event_stream(queue, state),
+        media_type="text/event-stream")


+@app.get("/random-sample-stream")
+async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str = Query(..., alias="dataset-name"), index: str = Query(None, alias="index")):
+    queue = asyncio.Queue()

-
-
-
-
-
-
-        import requests
+    def event_stream(queue):
+        yield f"data: {json.dumps({'status': 'grab_sample'})}\n\n"
+        try:
             headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN')}"}
             API_URL = f"https://datasets-server.huggingface.co/info?dataset={dataset_name}"
+
             def query():
                 response = requests.get(API_URL, headers=headers)
                 return response.json()
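The `/oauth_token` endpoint added above pairs each SSE connection with an `asyncio.Queue` stored in `state_queue_map` under a random `state`: the first event carries the Hugging Face authorize URL, and `/login/callback` (further down) pushes the access token into the same queue once the user completes the flow. A rough client-side sketch of consuming that stream, assuming the `requests` library and a placeholder host:

# Hypothetical consumer of the /oauth_token SSE stream; <space-host> is a placeholder.
import json
import requests

with requests.get("https://<space-host>/oauth_token", stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        event = json.loads(line[len("data: "):])
        if "url" in event:
            print("Open this URL to authorize:", event["url"])  # first event from the server
        elif "access_token" in event:
            print("Logged in as:", event.get("username"))       # pushed by /login/callback
            break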
@@ -283,15 +254,15 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str

             num_samples = split['num_examples']
             split_name = split['name']
-
-            # dataset = load_dataset(dataset_name, split=split_name, streaming=True)
-            idx = random.randint(0, num_samples) if index is None else int(index)

+            idx = random.randint(
+                0, num_samples) if index is None else int(index)

             API_URL = f"https://datasets-server.huggingface.co/rows?dataset={dataset_name}&config=default&split=train&offset={idx}&length=1"
-
+
             def query():
-                headers = {
+                headers = {
+                    "Authorization": f"Bearer {os.environ.get('HF_TOKEN')}"}
                 response = requests.get(API_URL, headers=headers)

                 if response.status_code != 200:
@@ -301,15 +272,7 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str

             random_sample = data['rows'][0]['row']

-            # pprint(random_sample)
-
-
-            # selected = dataset.skip(idx)
-            # random_sample = next(iter(selected))#random.choice(samples_buffer)
-
             hashed = sha256(json.dumps(random_sample))
-            # insert_sample(hashed, json.dumps(random_sample), dataset_name)
-            # background_tasks.add_task(insert_sample, hashed, json.dumps(random_sample), dataset_name)

         except Exception as e:
             message = ""
@@ -317,9 +280,9 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str
                 message = e.message
             else:
                 message = str(e)
-
+
             print("error : ", message)
-            yield f"data: {json.dumps({'status': 'error', 'message' : message })}\n\n"
+            yield f"data: {json.dumps({'status': 'error', 'message' : message })}\n\n"

         transformed_data = random_sample

@@ -328,7 +291,7 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str
         if not is_sharegpt(random_sample):
             try:
                 if not has_adapter(random_sample):
-                    yield f"data: {json.dumps({'status': 'creating_adapter'})}\n\n"
+                    yield f"data: {json.dumps({'status': 'creating_adapter'})}\n\n"

                 transformed_data = auto_tranform(random_sample)
             except Exception as e:
@@ -337,27 +300,29 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str
                     print("error : ", e.message)
                 else:
                     print("error : ", e)
-                yield f"data: {json.dumps({'status': 'error'})}\n\n"
+                yield f"data: {json.dumps({'status': 'error'})}\n\n"

         if success:
-            yield f"data: {json.dumps({'status': 'done', 'data' : transformed_data, 'index' : str(idx)})}\n\n"
-
-    return StreamingResponse(event_stream(queue), media_type="text/event-stream")
+            yield f"data: {json.dumps({'status': 'done', 'data' : transformed_data, 'index' : str(idx)})}\n\n"

+    return StreamingResponse(
+        event_stream(queue),
+        media_type="text/event-stream")


 @app.get("/random-sample")
 async def get_random_sample(dataset_name: str = Query(..., alias="dataset-name")):
     try:
-        dataset = load_dataset(dataset_name,streaming=True)
+        dataset = load_dataset(dataset_name, streaming=True)
         split = [key for key in dataset.keys() if "train" in key]
         dataset = load_dataset(dataset_name, split=split[0], streaming=True)

         buffer_size = 100 # Define a reasonable buffer size
-        samples_buffer = [
-
-
+        samples_buffer = [
+            sample for _, sample in zip(
+                range(buffer_size), dataset)]

+        random_sample = random.choice(samples_buffer)

         hashed = sha256(json.dumps(random_sample))

@@ -368,12 +333,12 @@ async def get_random_sample(dataset_name: str = Query(..., alias="dataset-name")
         if module_spec is None:
             create_adapter(random_sample, sanitized)

-        spec = importlib.util.spec_from_file_location(
+        spec = importlib.util.spec_from_file_location(
+            module_name, f"dataset_adapters/{sanitized}.py")
         dynamic_module = importlib.util.module_from_spec(spec)
         sys.modules[module_name] = dynamic_module
         spec.loader.exec_module(dynamic_module)

-        # Use the function from the dynamically imported module
         transformed_data = dynamic_module.transform_data(random_sample)

         return transformed_data
@@ -384,16 +349,14 @@ async def get_random_sample(dataset_name: str = Query(..., alias="dataset-name")
         raise HTTPException(status_code=500, detail=str(e))


-
 @app.get("/login/callback")
 async def oauth_callback(code: str, state: str):
-    # Prepare the authorization header
     credentials = f"{client_id}:{client_secret}"
     credentials_bytes = credentials.encode("ascii")
     base64_credentials = base64.b64encode(credentials_bytes)
     auth_header = f"Basic {base64_credentials.decode('ascii')}"
     username = ""
-
+
     try:
         token_response = requests.post(
             'https://huggingface.co/oauth/token',
@@ -411,31 +374,26 @@
         tokens = token_response.json()
         access_token = tokens.get('access_token')

-
         if access_token:
-
-
-
-
-
-
-
-
-
-
-            else:
-                username = ""
-                print(f"Error getting user data: {user_response.status_code}, {user_response.text}")
-
-        except Exception:
-            traceback.print_exc()
-            username = ""
-    else:
-        username = ""
+            url = "https://huggingface.co/oauth/userinfo"
+
+            payload = ""
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {access_token}"
+            }
+
+            response = requests.request(
+                "GET", url, data=payload, headers=headers)

+            print(response.text)

-
-
+            if state in state_queue_map:
+                queue = state_queue_map[state]
+                await queue.put({"access_token": access_token, "username": username})
+                await queue.put({"end_stream": True})
+            else:
+                username = ""
         else:
             access_token = ""

@@ -443,23 +401,20 @@
         traceback.print_exc()
         access_token = ""

-    return {"access_token": access_token, "username"
+    return {"access_token": access_token, "username": username}
+

 @app.get("/oauth-config")
 async def get_oauth_config(request: Request):
-    # client_host = "https://huggingface.co/spaces/thomasgauthier/ChatExplorer#request.client.host
     return {
         "client_id": client_id,
         "redirect_uri": f'https://{space_host}/login/callback'
     }


-# # @app.on_event("startup")
-# # async def startup_event():
-# #     await setup_database()
 @app.get("/")
 def index() -> FileResponse:
     return FileResponse(path="static/index.html", media_type="text/html")


-app.mount("/", StaticFiles(directory="static"), name="static")
+app.mount("/", StaticFiles(directory="static"), name="static")
static/assets/{index-66194b32.js → index-7974ca0c.js}
RENAMED
The diff for this file is too large to render.
static/index.html
CHANGED
@@ -5,7 +5,7 @@
     <link rel="icon" type="image/svg+xml" href="/vite.svg" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>Vite + Preact</title>
-    <script type="module" crossorigin src="/assets/index-66194b32.js"></script>
+    <script type="module" crossorigin src="/assets/index-7974ca0c.js"></script>
     <link rel="stylesheet" href="/assets/index-abe6d7fb.css">
   </head>
   <body ondrop="event.preventDefault()" >