|
import openai |
|
import gradio as gr |
|
from gradio.components import Audio, Textbox |
|
import os |
|
import re |
|
import tiktoken |
|
from transformers import GPT2Tokenizer |
|
import whisper |
|
import pandas as pd |
|
from datetime import datetime, timezone, timedelta |
|
import notion_df |
|
import concurrent.futures |
|
import nltk |
|
from nltk.tokenize import sent_tokenize |
|
nltk.download('punkt') |
|
import spacy |
|
from spacy import displacy |
|
from gradio import Markdown |
|
import threading |
|
|
|
|
|
# GPT-2 tokenizer is used only for counting and truncating tokens
# (the chat model itself is addressed by name in each API call).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# OpenAI API key from the environment.
# Fix: previously written as `model = openai.api_key = os.environ[...]`,
# which left a dead `model` variable holding the API key string — the model
# name is passed literally ("gpt-4") at the ChatCompletion call site.
openai.api_key = os.environ["OAPI_KEY"]

# System prompt framing the assistant as a USMLE tutor.
initialt = 'You are a USMLE Tutor. Respond with ALWAYS layered "bullet points" (listing rather than sentences) \
to all input with a fun mneumonics to memorize that list. But you can answer up to 1200 words if the user requests longer response. \
You are going to keep answer and also challenge the student to learn USMLE anatomy, phsysiology, and pathology.'
initial_message = {"role": "system", "content": initialt}

# Conversation state: `messages` is chronological (sent to the API);
# `messages_rev` keeps a newest-first copy for the displayed transcript.
messages = [initial_message]
messages_rev = [initial_message]

# Number of assistant answers since the last context reset (see transcribe()).
answer_count = 0

# Notion API key used by notion_df uploads.
API_KEY = os.environ["NAPI_KEY"]

# spaCy pipeline used for part-of-speech-based colorizing.
nlp = spacy.load("en_core_web_sm")
|
def process_nlp(system_message):
    """Return the HTML-colorized rendering of a chat message's content."""
    return colorize_text(system_message["content"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Default part-of-speech → CSS color palette used by colorize_text().
# Keys are spaCy coarse POS tags plus three token-class fallbacks
# (digit / punct / quote) checked when no POS color matches.
COLORS = {
    "NOUN": "#FF3300",
    "VERB": "#008000",
    "ADJ": "#1E90FF",
    "ADV": "#FF8C00",
    "digit": "#FF1493",
    "punct": "#8B0000",
    "quote": "#800080",
}

# Alternate palette; colorize_text() applies it to named-entity tokens when
# the default palette is in effect. NOTE(review): presumably chosen for
# dyslexia-friendly contrast — confirm intent with the author.
DYSLEXIA_COLORS = {
    "NOUN": "#1E90FF",
    "VERB": "#006400",
    "ADJ": "#00CED1",
    "ADV": "#FF8C00",
    "digit": "#FF1493",
    "punct": "#A0522D",
    "quote": "#800080",
}

# CSS background applied behind colorized tokens.
BACKGROUND_COLOR = "#EAEAEA"

# Font family and size applied to every generated <span>.
FONT = "Georgia"
FONT_SIZE = "18px"
|
|
|
def colorize_text(text, colors=None, background_color=None):
    """Render *text* as HTML, coloring each token by its part of speech.

    Each line is tokenized with the module-level spaCy pipeline ``nlp`` and
    every token is wrapped in an underlined ``<span>``.  Tokens inside a named
    entity use DYSLEXIA_COLORS when the default palette is active
    (NOTE(review): this palette swap for entities looks intentional but is
    undocumented — confirm).  Tokens with no POS color fall back to the
    digit/punct/quote palette entries, then to an uncolored span.

    Args:
        text: Plain text; lines are separated on newline characters and the
            output joins them with ``<br>``.
        colors: POS-tag → color mapping; defaults to COLORS.
        background_color: CSS background for colored tokens; defaults to
            BACKGROUND_COLOR.

    Returns:
        A single HTML string.
    """
    if colors is None:
        colors = COLORS
    if background_color is None:
        background_color = BACKGROUND_COLOR

    def _span(token_text, color=None):
        # Build one styled <span>; color/background are omitted when color
        # is None, matching the original uncolored branch byte-for-byte.
        style = ""
        if color is not None:
            style += f"color: {color}; background-color: {background_color}; "
        style += (
            f"font-family: {FONT}; "
            f"font-size: {FONT_SIZE}; "
            f"text-decoration: underline;"
        )
        return f'<span style="{style}">{token_text}</span>'

    # Collect fragments and join once at the end — the original built the
    # result with repeated `+=`, which is quadratic on long inputs.
    parts = []
    for line in text.split("\n"):
        for token in nlp(line):
            if token.ent_type_:
                # Entity tokens: swap the default palette for DYSLEXIA_COLORS.
                palette = DYSLEXIA_COLORS if colors == COLORS else colors
                parts.append(_span(token.text, palette.get(token.pos_, None)))
            else:
                color = colors.get(token.pos_, None)
                if color is not None:
                    parts.append(_span(token.text, color))
                elif token.is_digit:
                    parts.append(_span(token.text, colors["digit"]))
                elif token.is_punct:
                    parts.append(_span(token.text, colors["punct"]))
                elif token.is_quote:
                    parts.append(_span(token.text, colors["quote"]))
                else:
                    parts.append(_span(token.text))
            parts.append(" ")
        parts.append("<br>")
    return "".join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def colorize_and_update(system_message, submit_update):
    """Colorize the reply and push only the HTML slot via the UI callback."""
    html = colorize_text(system_message["content"])
    submit_update(None, html)
|
|
|
def update_text_output(system_message, submit_update):
    """Push only the plain-text reply via the UI callback (HTML slot untouched)."""
    content = system_message["content"]
    submit_update(content, None)
|
|
|
|
|
def train(text):
    """Archive *text* to the Notion notes page, titled with the current time.

    The timestamp uses a fixed UTC-4 offset (NOTE(review): presumably US
    Eastern; this ignores daylight-saving shifts — confirm).
    """
    eastern = timezone(timedelta(hours=-4))
    published_date = datetime.now(eastern).strftime('%m-%d-%y %H:%M')
    frame = pd.DataFrame([text])
    notion_df.upload(frame, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date), api_key=API_KEY)
|
|
|
|
|
def transcribe(audio, text, submit_update=None):
    """Main Gradio handler: gather input, query GPT-4, archive to Notion.

    Args:
        audio: Filepath of a recorded clip, or None; transcribed via Whisper.
        text: Typed input.  If its first word is "COLORIZE", the note is
            archived and returned colorized without calling the chat model.
        submit_update: Optional callback(text_out, html_out) used to stream
            partial results back to the UI before the final return.

    Returns:
        (reply_text, reply_html) for the two Gradio outputs.
    """
    global messages
    global answer_count

    transcript = {'text': ''}
    # Fix: this was initialised to [] (a list), which raised TypeError on
    # `transcript["text"] + input_text` whenever `text` was None.
    input_text = ''

    # COLORIZE command: archive the note and return it colorized, no LLM call.
    if text and text.split("\n")[0].split(" ")[0].strip().upper() == "COLORIZE":
        train(text)
        colorized_input = colorize_text(text)
        return text, colorized_input

    # Transcribe recorded audio with Whisper.
    # Fix: the file handle was previously opened and never closed.
    if audio is not None:
        with open(audio, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")

    if text is not None:
        # Keep at most 1440 GPT-2 tokens of typed input, cutting on sentence
        # boundaries where possible.
        sentences = re.split("(?<=[.!?]) +", text)
        input_tokens = []
        for sentence in sentences:
            sentence_tokens = tokenizer.encode(sentence)
            if len(input_tokens) + len(sentence_tokens) < 1440:
                input_tokens.extend(sentence_tokens)
            else:
                # Truncate the sentence that would overflow the budget.
                sentence_tokens = sentence_tokens[:1440 - len(input_tokens)]
                input_tokens.extend(sentence_tokens)
                break
        input_text = tokenizer.decode(input_tokens)

    messages.append({"role": "user", "content": transcript["text"] + input_text})

    # When the running context exceeds the token budget, back the transcript
    # up to Notion and restart the conversation from the system prompt.
    num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
    if num_tokens > 2096:
        chat_transcript = "\n\n".join(
            [f"[ANSWER {answer_count}]{message['role']}: {message['content']}"
             for message in messages if message['role'] != 'system'])
        chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"

        # Fixed UTC-4 timestamp (presumably US Eastern; ignores DST).
        now_et = datetime.now(timezone(timedelta(hours=-4)))
        published_date = now_et.strftime('%m-%d-%y %H:%M')

        df = pd.DataFrame([chat_transcript])
        notion_df.upload(df, 'https://www.notion.so/YENA-be569d0a40c940e7b6e0679318215790?pvs=4', title=str(published_date + 'back_up'), api_key=API_KEY)

        # Reset conversation state.
        messages = [initial_message]
        messages.append({"role": "user", "content": initialt})
        answer_count = 0
        messages.append({"role": "user", "content": input_text})
    else:
        answer_count += 1

    # Query GPT-4.  (Removed a ThreadPoolExecutor and a `prompt` list that
    # were built but never used — this call was already synchronous.)
    system_message = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=2000
    )["choices"][0]["message"]

    # Show the plain-text reply as soon as it arrives.
    if submit_update:
        update_text_output(system_message, submit_update)

    messages.append(system_message)

    # messages_rev keeps newest-first ordering for the displayed transcript.
    messages_rev.insert(0, system_message)
    messages_rev.insert(0, {"role": "user", "content": input_text + transcript["text"]})

    # Colorize in the background so the text reply is not delayed.
    if submit_update:
        colorize_thread = threading.Thread(target=colorize_and_update, args=(system_message, submit_update))
        colorize_thread.start()

    # Build the newest-first transcript and archive it to Notion.
    # (Removed a dead `chat_transcript = system_message['content']`
    # assignment that was immediately overwritten.)
    chat_transcript = "\n\n".join(
        [f"[ANSWER {answer_count}]{message['role']}: {message['content']}"
         for message in messages_rev if message['role'] != 'system'])
    chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"

    now_et = datetime.now(timezone(timedelta(hours=-4)))
    published_date = now_et.strftime('%m-%d-%y %H:%M')
    df = pd.DataFrame([chat_transcript])
    notion_df.upload(df, 'https://www.notion.so/YENA-be569d0a40c940e7b6e0679318215790?pvs=4', title=str(published_date), api_key=API_KEY)

    return system_message['content'], colorize_text(system_message['content'])
|
|
|
|
|
|
|
# --- Gradio UI wiring ------------------------------------------------------
# NOTE(review): source=/layout=/allow_flagging=False match the legacy
# Gradio 3.x Interface API — confirm the pinned Gradio version before upgrading.

# Microphone input; delivered to transcribe() as a filepath.
audio_input = Audio(source="microphone", type="filepath", label="Record your message")
# Typed input; also accepts the special "COLORIZE" command on its first line.
text_input = Textbox(label="Type your message", max_length=4096)
# Plain-text copy of the assistant reply.
output_text = Textbox(label="Text Output")
# HTML/markdown slot for the colorized rendering.
output_html = Markdown()

iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, text_input],
    outputs=[output_text, output_html],
    title="Hold On, Pain Ends (HOPE)",
    description="Talk to Your USMLE Tutor HOPE. \n If you want to colorize your note, type COLORIZE in the first line of your input.",
    theme="compact",
    layout="vertical",
    allow_flagging=False
)

# Launch the app (blocking call).
iface.launch()