Update code (#9)
- update code (d281662a0810be41652a2f14d20401324325c29f)
- add fix (f14e66e1c82b6e7d51eea0b0e4254485ee737490)
Co-authored-by: Shivalika Singh <[email protected]>
- app.py +52 -15
- aya_vision_utils.py +57 -9
- requirements.txt +5 -1
app.py
CHANGED
@@ -25,11 +25,14 @@ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from prompt_examples import TEXT_CHAT_EXAMPLES, IMG_GEN_PROMPT_EXAMPLES, AUDIO_EXAMPLES, TEXT_CHAT_EXAMPLES_LABELS, IMG_GEN_PROMPT_EXAMPLES_LABELS, AUDIO_EXAMPLES_LABELS, AYA_VISION_PROMPT_EXAMPLES
 from preambles import CHAT_PREAMBLE, AUDIO_RESPONSE_PREAMBLE, IMG_DESCRIPTION_PREAMBLE
 from constants import LID_LANGUAGES, NEETS_AI_LANGID_MAP, AYA_MODEL_NAME, BATCH_SIZE, USE_ELVENLABS, USE_REPLICATE
-from aya_vision_utils import get_aya_vision_response, get_aya_vision_prompt_example
+from aya_vision_utils import get_aya_vision_response, get_aya_vision_prompt_example, insert_aya_audio, insert_aya_image, connect_with_connector
+from google.cloud import storage
+
 # from dotenv import load_dotenv
 
 # load_dotenv()
 
+
 HF_API_TOKEN = os.getenv("HF_API_KEY")
 ELEVEN_LABS_KEY = os.getenv("ELEVEN_LABS_KEY")
 NEETS_AI_API_KEY = os.getenv("NEETS_AI_API_KEY")
@@ -62,6 +65,17 @@ eleven_labs_client = ElevenLabs(
     api_key=ELEVEN_LABS_KEY,
 )
 
+BUCKET_NAME = os.getenv("BUCKET_NAME")
+AUDIO_BUCKET = os.getenv("AUDIO_BUCKET")
+IMAGE_STORAGE_PATH = os.getenv("IMAGE_STORAGE_PATH")
+AUDIO_STORAGE_PATH = os.getenv("AUDIO_STORAGE_PATH")
+SAVING_ENABLED = True
+
+storage_client = storage.Client()
+bucket = storage_client.bucket(BUCKET_NAME)
+audio_bucket = storage_client.bucket(AUDIO_BUCKET)
+connection = connect_with_connector()
+
 # Language identification
 lid_model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
 LID_model = fasttext.load_model(lid_model_path)
@@ -102,20 +116,34 @@ def replicate_api_inference(input_prompt):
     image = Image.open(image[0])
     return image
 
-def generate_image(input_prompt, model_id="black-forest-labs/FLUX.1-schnell"):
-    if input_prompt:
+def generate_image(input_prompt, generated_img_desc, model_id="black-forest-labs/FLUX.1-schnell"):
+    if input_prompt and generated_img_desc:
         if USE_REPLICATE:
             print("using replicate for image generation")
-            image = replicate_api_inference(input_prompt)
+            image = replicate_api_inference(generated_img_desc)
         else:
             try:
                 print("using HF inference API for image generation")
-                image_bytes = get_hf_inference_api_response({ "inputs": input_prompt}, model_id)
+                image_bytes = get_hf_inference_api_response({ "inputs": generated_img_desc}, model_id)
                 image = np.array(Image.open(io.BytesIO(image_bytes)))
             except Exception as e:
                 print("HF API error:", e)
                 # generate image with help replicate in case of error
-                image = replicate_api_inference(input_prompt)
+                image = replicate_api_inference(generated_img_desc)
+
+        # save image to local file
+        image_path = "generated_image.png"
+        image.save(image_path)
+
+        if SAVING_ENABLED:
+            unique_id = str(uuid.uuid4())
+
+            blob = bucket.blob(IMAGE_STORAGE_PATH + unique_id + "_" + image_path)
+            blob.upload_from_filename(image_path)
+            gcp_image_path = f"gs://{BUCKET_NAME}/{IMAGE_STORAGE_PATH}{unique_id}_{image_path}"
+
+            insert_aya_image(connection, input_prompt, generated_img_desc, gcp_image_path)
+
         return image
     else:
         return None
@@ -246,7 +274,7 @@ def clean_text(text, remove_bullets=False, remove_newline=False):
 
     return cleaned_text
 
-def convert_text_to_speech(text, language="english"):
+def convert_text_to_speech(transcript, text, language="english"):
 
     # do language detection to determine voice of speech response
     if text:
@@ -268,19 +296,28 @@ def convert_text_to_speech(text, language="english"):
         else:
             # use elevenlabs for TTS
             audio_path = elevenlabs_generate_audio(text)
+
+        if SAVING_ENABLED:
+            unique_id = str(uuid.uuid4())
+
+            blob = audio_bucket.blob(AUDIO_STORAGE_PATH + unique_id + "_" + audio_path)
+            blob.upload_from_filename(audio_path)
+            gcp_audio_path = f"gs://{BUCKET_NAME}/{AUDIO_STORAGE_PATH}{unique_id}_{audio_path}"
+
+            insert_aya_audio(connection, transcript, text, gcp_audio_path)
 
         return audio_path
     else:
         return None
 
 def elevenlabs_generate_audio(text):
-    audio = eleven_labs_client.
+    audio = eleven_labs_client.text_to_speech.convert(
         text=text,
-
-
-
-
-    audio_path = "
+        voice_id="21m00Tcm4TlvDq8ikWAM", #Rachel
+        model_id="eleven_multilingual_v2",
+        output_format="mp3_44100_128",
+    )
+    audio_path = "audio.mp3"
     save(audio, audio_path)
     return audio_path
 
@@ -534,7 +571,7 @@ with demo:
 
     generated_img_desc.change(
         generate_image, #run_flux,
-        inputs=[generated_img_desc],
+        inputs=[input_img_prompt, generated_img_desc],
        outputs=[generated_img],
         show_progress="full",
     )
@@ -558,7 +595,7 @@ with demo:
         show_progress="full",
    ).then(
        convert_text_to_speech,
-        inputs=[e2e_audio_file_aya_response],
+        inputs=[e2e_audio_file_trans, e2e_audio_file_aya_response],
        outputs=[e2e_aya_audio_response],
        show_progress="full",
    )
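Note on the image-generation fallback above: get_hf_inference_api_response is defined elsewhere in app.py and is not part of this diff. As a rough, hypothetical sketch of what such a helper typically does (the URL, header, and error handling here are assumptions, not the Space's actual code), it posts the {"inputs": ...} payload to the hosted Inference API for the given model and returns the raw image bytes that generate_image then decodes with PIL:

import os
import requests

HF_API_TOKEN = os.getenv("HF_API_KEY")

def get_hf_inference_api_response(payload, model_id):
    # Hypothetical sketch: POST the prompt payload to the hosted Inference API
    # for model_id and return the raw bytes of the generated image.
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    response = requests.post(api_url, headers=headers, json=payload)
    # Raising on HTTP errors lets generate_image's except branch fall back to Replicate.
    response.raise_for_status()
    return response.content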
aya_vision_utils.py
CHANGED
@@ -9,7 +9,10 @@ import os
 import traceback
 import random
 import gradio as gr
-
+from google.cloud.sql.connector import Connector, IPTypes
+import pg8000
+from datetime import datetime
+import sqlalchemy
 # from dotenv import load_dotenv
 # load_dotenv()
 
@@ -32,9 +35,6 @@ def cohere_vision_chat(chat_history, model=VISION_COHERE_MODEL_NAME):
 
 def get_aya_vision_prompt_example(language):
     example = AYA_VISION_PROMPT_EXAMPLES[language]
-    print("example:", example)
-    print("example prompt:", example[0])
-    print("example image:", example[1])
     return example[0], example[1]
 
 def get_base64_from_local_file(file_path):
@@ -42,7 +42,6 @@ def get_base64_from_local_file(file_path):
         print("loading image")
         with open(file_path, "rb") as image_file:
             base64_image = base64.b64encode(image_file.read()).decode('utf-8')
-            print("converted image")
         return base64_image
     except Exception as e:
         logger.debug(f"Error converting local image to base64 string: {e}")
@@ -50,8 +49,6 @@ def get_base64_from_local_file(file_path):
 
 
 def get_aya_vision_response(incoming_message, image_filepath, max_size_mb=5):
-    print("incoming message:", incoming_message)
-    print("image_filepath:", image_filepath)
     max_size_bytes = max_size_mb * 1024 * 1024
 
     image_ext = image_filepath.lower()
@@ -70,7 +67,6 @@ def get_aya_vision_response(incoming_message, image_filepath, max_size_mb=5):
     print("converting image to base 64")
     base64_image = get_base64_from_local_file(image_filepath)
     image = f"data:{image_type};base64,{base64_image}"
-    print("Image base64:", image[:30])
 
     # to prevent Cohere API from throwing error for empty message
     if incoming_message=="" or incoming_message is None:
@@ -108,4 +104,56 @@ def get_base64_image_size(base64_string):
     base64_data = base64_data.replace('\n', '').replace('\r', '').replace(' ', '')
     padding = base64_data.count('=')
     size_bytes = (len(base64_data) * 3) // 4 - padding
-    return size_bytes
+    return size_bytes
+
+
+def insert_aya_audio(connection, user_prompt, text_response, audio_response_file_path):
+    with connection.begin():
+        connection.execute(
+            sqlalchemy.text("""
+                INSERT INTO aya_audio (user_prompt, text_response, audio_response_file_path, timestamp)
+                VALUES (:user_prompt, :text_response, :audio_response_file_path, :timestamp)
+            """),
+            {"user_prompt": user_prompt, "text_response": text_response, "audio_response_file_path": audio_response_file_path, "timestamp": datetime.now()}
+        )
+
+def insert_aya_image(connection, user_prompt, generated_img_desc, image_response_file_path):
+    with connection.begin():
+        connection.execute(
+            sqlalchemy.text("""
+                INSERT INTO aya_image (user_prompt, generated_img_desc, image_response_file_path, timestamp)
+                VALUES (:user_prompt, :generated_img_desc, :image_response_file_path, :timestamp)
+            """),
+            {"user_prompt": user_prompt, "generated_img_desc": generated_img_desc, "image_response_file_path": image_response_file_path, "timestamp": datetime.now()}
+        )
+
+def connect_with_connector() -> sqlalchemy.engine.base.Engine:
+    instance_connection_name = os.environ[
+        "INSTANCE_CONNECTION_NAME"
+    ]
+    db_user = os.environ["DB_USER"]
+    db_pass = os.environ["DB_PASS"]
+    db_name = os.environ["DB_NAME"]
+
+    ip_type = IPTypes.PRIVATE if os.environ.get("PRIVATE_IP") else IPTypes.PUBLIC
+
+    connector = Connector(refresh_strategy="LAZY")
+
+    def getconn() -> pg8000.dbapi.Connection:
+        conn: pg8000.dbapi.Connection = connector.connect(
+            instance_connection_name,
+            "pg8000",
+            user=db_user,
+            password=db_pass,
+            db=db_name,
+            ip_type=ip_type,
+        )
+        return conn
+
+    pool = sqlalchemy.create_engine(
+        "postgresql+pg8000://",
+        creator=getconn,
+    )
+
+    connection = pool.connect()
+    return connection
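The two insert helpers above assume that aya_audio and aya_image tables already exist in the Cloud SQL (Postgres) database; the diff does not include their schema. A minimal one-time setup sketch consistent with the column names used in the INSERT statements (the id column and the column types are assumptions) could look like:

import sqlalchemy
from aya_vision_utils import connect_with_connector

# Returns a pooled SQLAlchemy connection via the Cloud SQL Python Connector.
connection = connect_with_connector()
with connection.begin():
    connection.execute(sqlalchemy.text("""
        CREATE TABLE IF NOT EXISTS aya_audio (
            id SERIAL PRIMARY KEY,
            user_prompt TEXT,
            text_response TEXT,
            audio_response_file_path TEXT,
            timestamp TIMESTAMP
        )
    """))
    connection.execute(sqlalchemy.text("""
        CREATE TABLE IF NOT EXISTS aya_image (
            id SERIAL PRIMARY KEY,
            user_prompt TEXT,
            generated_img_desc TEXT,
            image_response_file_path TEXT,
            timestamp TIMESTAMP
        )
    """))

Note that, despite its Engine return annotation, connect_with_connector returns a single Connection from pool.connect(), so app.py shares that one connection for all inserts.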
requirements.txt
CHANGED
@@ -10,4 +10,8 @@ groq
 replicate
 fasttext
 cutlet
-fugashi[unidic-lite]
+fugashi[unidic-lite]
+python-dotenv
+SQLAlchemy
+google-cloud-storage
+cloud-sql-python-connector[pg8000]
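The new dependencies back the storage and database code above: google-cloud-storage for the GCS uploads, cloud-sql-python-connector[pg8000] and SQLAlchemy for connect_with_connector, and python-dotenv for the load_dotenv calls that are currently commented out. For local testing, a sketch of the environment the new code expects (variable names are taken from the diff; values would come from a local .env file or the Space's secrets):

from dotenv import load_dotenv

# Assumes a local .env file; in the deployed Space these come from the environment.
load_dotenv()

# Read in app.py:
#   HF_API_KEY, ELEVEN_LABS_KEY, NEETS_AI_API_KEY         (already present)
#   BUCKET_NAME, AUDIO_BUCKET                             (new: GCS buckets)
#   IMAGE_STORAGE_PATH, AUDIO_STORAGE_PATH                (new: object name prefixes)
# Read in aya_vision_utils.connect_with_connector:
#   INSTANCE_CONNECTION_NAME, DB_USER, DB_PASS, DB_NAME   (new: Cloud SQL)
#   PRIVATE_IP                                            (optional: connect over private IP)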