|
import os
import re
from datetime import datetime, timezone, timedelta

import gradio as gr
import notion_df
import openai
import pandas as pd
import tiktoken
import whisper
from gradio.components import Audio, Textbox
from transformers import GPT2Tokenizer
|
|
|
openai.api_key = os.environ["OPENAI_API_KEY"] |
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') |
|
|
|
initial_message = {"role": "system", "content": 'You are a USMLE Tutor. Respond with ALWAYS layered "bullet points" (listing rather than sentences) to all input with a fun mneumonics to memorize that list. But you can answer up to 1200 words if the user requests longer response.'} |
|
messages = [initial_message] |
|
|
|
answer_count = 0 |
|
|
|
|
|
model = whisper.load_model("base") |
|
|
|
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"): |
|
"""Returns the number of tokens used by a list of messages.""" |
|
try: |
|
encoding = tiktoken.encoding_for_model(model) |
|
except KeyError: |
|
encoding = tiktoken.get_encoding("cl100k_base") |
|
if model == "gpt-3.5-turbo-0301": |
|
num_tokens = 0 |
|
for message in messages: |
|
num_tokens += 4 |
|
for key, value in message.items(): |
|
num_tokens += len(encoding.encode(value)) |
|
if key == "name": |
|
num_tokens += -1 |
|
num_tokens += 2 |
|
return num_tokens |
|
else: |
|
raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}. |
|
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""") |
|
|
|
def transcribe(audio, text): |
|
global messages |
|
global answer_count |
|
|
|
if audio is not None: |
|
audio_file = open(audio, "rb") |
|
transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en") |
|
messages.append({"role": "user", "content": transcript["text"]}) |
|
|
|
if text is not None: |
|
|
|
sentences = re.split("(?<=[.!?]) +", text) |
|
|
|
|
|
sentence_tokens = [tokenizer.encode(sentence) for sentence in sentences] |
|
|
|
|
|
input_tokens = [token for sentence in sentence_tokens for token in sentence] |
|
|
|
|
|
num_tokens = num_tokens_from_messages(messages) |
|
if num_tokens + len(input_tokens) > 2200: |
|
|
|
messages = [initial_message] |
|
answer_count = 0 |
|
input_text = 'Can you click the Submit button one more time? (say Yes)' |
|
messages.append({"role": "user", "content": input_text}) |
|
else: |
|
|
|
input_text = tokenizer.decode(input_tokens) |
|
messages.append({"role": "user", "content": input_text}) |
|
|
|
|
|
num_tokens = num_tokens_from_messages(messages) |
|
if num_tokens > 2096: |
|
|
|
chat_transcript = "" |
|
for message in messages: |
|
if message['role'] != 'system': |
|
chat_transcript += f"[ANSWER {answer_count}]{message['role']}: {message['content']}\n\n" |
|
|
|
chat_transcript += f"Number of tokens used: {num_tokens}\n\n" |
|
|
|
|
|
now_et = datetime.now(timezone(timedelta(hours=-5))) |
|
|
|
published_date = now_et.strftime('%m-%d-%y %H:%M') |
|
|
|
|
|
df = pd.DataFrame([chat_transcript]) |
|
notion_df.upload(df, 'https://www.notion.so/personal-5e3978680ca848bda844452129955138?pvs=4', title=str(published_date), api_key=API_KEY) |
|
|
|
|
|
messages = [initial_message] |
|
answer_count = 0 |
|
|
|
|
|
answer_count += 1 |
|
|
|
|
|
system_message = openai.ChatCompletion.create( |
|
model="gpt-3.5-turbo", |
|
messages=messages, |
|
max_tokens=2000 |
|
)["choices"][0]["message"] |
|
|
|
|
|
|
|
|
|
messages.append({"role": "system", "content": system_message}) |
|
|
|
|
|
chat_transcript = "" |
|
for message in messages: |
|
if message['role'] != 'system': |
|
chat_transcript += f"[ANSWER {answer_count}]{message['role']}: {message['content']}\n\n" |
|
|
|
|
|
num_tokens = num_tokens_from_messages(messages) |
|
chat_transcript += f"Number of tokens used: {num_tokens}\n\n" |
|
|
|
|
|
now_et = datetime.now(timezone(timedelta(hours=-5))) |
|
|
|
published_date = now_et.strftime('%m-%d-%y %H:%M') |
|
|
|
|
|
df = pd.DataFrame([chat_transcript]) |
|
notion_df.upload(df, 'https://www.notion.so/personal-5e3978680ca848bda844452129955138?pvs=4', title=str(published_date), api_key=API_KEY) |
|
|
|
|
|
if num_tokens > 2096: |
|
messages = [initial_message] |
|
answer_count = 0 |
|
else: |
|
|
|
answer_count += 1 |
|
|
|
|
|
system_message = openai.Completion.create( |
|
engine="text-davinci-002", |
|
prompt=[{"text": f"{message['role']}: {message['content']}\n\n"} for message in messages], |
|
temperature=0.7, |
|
max_tokens=2000, |
|
n=1, |
|
stop=None, |
|
)[0]["text"] |
|
|
|
|
|
messages.append({"role": "system", "content": system_message}) |
|
|
|
|
|
audio_input = Audio(source="microphone", type="filepath", label="Record your message") |
|
text_input = Textbox(label="Type your message", max_length=4096) |
|
|
|
output_text = gr.outputs.Textbox(label="Response") |
|
|
|
iface = gr.Interface( |
|
fn=transcribe, |
|
inputs=[audio_input, text_input], |
|
outputs="text", |
|
title="YENA", |
|
description="Tutor YENA") |
|
|
|
|
|
iface.launch() |
|
|
|
|