Spaces:

Adeenakk
/

urduvoice

Runtime error

App Files Community

Adeenakk committed on Aug 31, 2024

Commit

2d5eff6

verified ·

1 Parent(s): 771bfb5

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -21

app.py CHANGED Viewed

@@ -6,10 +6,14 @@ import requests
 from gtts import gTTS
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 from pydub import AudioSegment
-# Load environment variables for API keys
-RAPIDAPI_KEY = os.getenv('RAPIDAPI_LANG_TRANS')
-GROQ_API_KEY = os.getenv('GROQ_API_KEY')
 # Load the Whisper model
 processor = AutoProcessor.from_pretrained("ihanif/whisper-medium-urdu")
@@ -43,10 +47,21 @@ def process_audio(file_path):
         # Convert audio to numpy array for processing
         audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0  # Normalize to [-1, 1] range
         audio_input = processor(audio_samples, return_tensors="pt", sampling_rate=16000)
         # Transcribe the audio using the fine-tuned Whisper model
-        result = model.generate(**audio_input)
         text = processor.batch_decode(result, skip_special_tokens=True)[0]
         if not text.strip():  # Check if the transcribed text is empty
@@ -58,22 +73,15 @@ def process_audio(file_path):
         urdu_to_eng = translate("en", text)
         print(f"Translated Text (English): {urdu_to_eng}")  # Debugging step
-        # Make API call to Groq
-        groq_url = "https://api.groq.com/your-endpoint"  # Replace with actual Groq API endpoint
-        groq_headers = {
-            "Authorization": f"Bearer {GROQ_API_KEY}",
-            "Content-Type": "application/json"
-        }
-        groq_payload = {
-            "messages": [{"role": "user", "content": urdu_to_eng}],
-            "model": "llama3-8b-8192",  # Adjust model if needed
-            "max_tokens": 50
-        }
-        response = requests.post(groq_url, json=groq_payload, headers=groq_headers)
-        chat_completion = response.json()
-        # Access the response
-        response_message = chat_completion["choices"][0]["message"]["content"].strip()
         print(f"Groq Response (English): {response_message}")  # Debugging step
         # Translate the response text back to Urdu
@@ -104,4 +112,4 @@ iface = gr.Interface(
     live=True
 )
-iface.launch()

 from gtts import gTTS
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 from pydub import AudioSegment
+from groq import Groq
+from google.colab import userdata
+RAPIDAPI_KEY = userdata.get('RAPIDAPI_LANG_TRANS')
+GROQ_API_KEY = userdata.get('GROQ_API_KEY')
+# Initialize the Groq client
+client = Groq(api_key=GROQ_API_KEY)
 # Load the Whisper model
 processor = AutoProcessor.from_pretrained("ihanif/whisper-medium-urdu")
         # Convert audio to numpy array for processing
         audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0  # Normalize to [-1, 1] range
+        # Create attention mask
+        # Assume padding length is determined by the maximum length of sequences
+        # For simplicity, we'll just create a mask where all values are 1 (no padding)
+        # In practice, you would adjust this based on actual sequence length
+        attention_mask = np.ones_like(audio_samples, dtype=np.int64)
         audio_input = processor(audio_samples, return_tensors="pt", sampling_rate=16000)
         # Transcribe the audio using the fine-tuned Whisper model
+        # Pass the attention mask as well
+        result = model.generate(
+            **audio_input,
+            attention_mask=torch.tensor(attention_mask).unsqueeze(0)  # Add batch dimension
+        )
         text = processor.batch_decode(result, skip_special_tokens=True)[0]
         if not text.strip():  # Check if the transcribed text is empty
         urdu_to_eng = translate("en", text)
         print(f"Translated Text (English): {urdu_to_eng}")  # Debugging step
+        # Generate a response using Groq
+        chat_completion = client.chat.completions.create(
+            messages=[{"role": "user", "content": urdu_to_eng}],
+            model="llama3-8b-8192",  # Ensure the model supports Urdu if possible
+            max_tokens=50
+        )
+        # Access the response using dot notation
+        response_message = chat_completion.choices[0].message.content.strip()
         print(f"Groq Response (English): {response_message}")  # Debugging step
         # Translate the response text back to Urdu
     live=True
 )
+iface.launch(share=True)