Spaces: Running on Zero
Upload app.py
app.py CHANGED
@@ -13,6 +13,7 @@ MODEL_ID = "Qwen/Qwen-Audio-Chat"
 QWEN_CHAT_TEMPLATE = """{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] }}{% endif %}{{ eos_token }}{% endfor %}"""
 
 def load_model():
+    print("Loading model and tokenizer...")
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype=torch.float16,
@@ -21,14 +22,17 @@ def load_model():
     )
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
     tokenizer.chat_template = QWEN_CHAT_TEMPLATE
+    print("Model and tokenizer loaded successfully")
     return model, tokenizer
 
 def process_audio(audio_path):
     """Process audio file and return the appropriate format for the model."""
     try:
+        print(f"Processing audio file: {audio_path}")
         audio_data, sample_rate = sf.read(audio_path)
         if len(audio_data.shape) > 1:
             audio_data = audio_data.mean(axis=1) # Convert stereo to mono if necessary
+        print(f"Audio processed successfully. Sample rate: {sample_rate}, Shape: {audio_data.shape}")
         return True
     except Exception as e:
         print(f"Error processing audio: {e}")
@@ -44,6 +48,8 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
     Returns:
         str: Model's response about the audio
     """
+    print(f"\nStarting analysis with audio_path: {audio_path}, question: {question}")
+
     # Input validation
     if audio_path is None or not isinstance(audio_path, str):
         return "Please provide a valid audio file."
@@ -54,35 +60,58 @@ def analyze_audio(audio_path: str, question: str = None) -> str:
     if not process_audio(audio_path):
         return "Failed to process the audio file. Please ensure it's a valid audio format."
 
-    model, tokenizer = load_model()
-    query = question if question else "Please describe what you hear in this audio clip."
-
     try:
+        model, tokenizer = load_model()
+        query = question if question else "Please describe what you hear in this audio clip."
+
+        print("Preparing messages...")
         messages = [
             {
                 "role": "user",
-                "content": f"<audio>{audio_path}</audio>"
+                "content": f"Here is an audio clip: <audio>{audio_path}</audio>\n{query}"
             }
         ]
 
+        print("Applying chat template...")
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True
         )
+        print(f"Generated prompt text: {text[:200]}...") # Print first 200 chars of prompt
+
+        print("Tokenizing input...")
         model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
+
+        print("Generating response...")
         with torch.no_grad():
             outputs = model.generate(
                 **model_inputs,
                 max_new_tokens=512,
                 temperature=0.7,
-                do_sample=True
+                do_sample=True,
+                pad_token_id=tokenizer.pad_token_id,
+                bos_token_id=tokenizer.bos_token_id,
+                eos_token_id=tokenizer.eos_token_id
             )
-
-
-
+
+        if outputs is None:
+            print("Model generated None output")
+            return "The model failed to generate a response. Please try again."
+
+        print(f"Output shape: {outputs.shape}")
+        if len(outputs.shape) != 2 or outputs.shape[0] == 0:
+            print(f"Unexpected output shape: {outputs.shape}")
+            return "The model generated an invalid response. Please try again."
+
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        print(f"Generated response: {response[:200]}...") # Print first 200 chars of response
+        return response
+
     except Exception as e:
+        print(f"Error during processing: {str(e)}")
+        import traceback
+        traceback.print_exc()
         return f"An error occurred while processing: {str(e)}"
 
 # Create Gradio interface with clear input/output specifications
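
Note on the prompt construction: QWEN_CHAT_TEMPLATE above is a Jinja template that tokenizer.apply_chat_template renders with the message list and the tokenizer's special tokens. The sketch below renders it with jinja2 directly so the prompt layout is visible; the bos/eos strings are placeholders (the real values come from the Qwen-Audio-Chat tokenizer), so this is illustrative only, not part of the app.

    # Sketch: render the template outside transformers to see the prompt layout.
    from jinja2 import Template

    QWEN_CHAT_TEMPLATE = (
        "{{ bos_token }}{% for message in messages %}"
        "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}"
        "{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] }}"
        "{% endif %}{{ eos_token }}{% endfor %}"
    )

    messages = [{
        "role": "user",
        "content": "Here is an audio clip: <audio>clip.wav</audio>\nWhat do you hear?",
    }]

    prompt = Template(QWEN_CHAT_TEMPLATE).render(
        bos_token="<bos>",  # placeholder; apply_chat_template fills this from the tokenizer
        eos_token="<eos>",  # placeholder
        messages=messages,
    )
    print(prompt)
    # <bos>User: Here is an audio clip: <audio>clip.wav</audio>
    # What do you hear?<eos>

Since the template has no add_generation_prompt branch, passing add_generation_prompt=True in the diff does not append an "Assistant:" cue; generation simply continues after the user turn.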
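
The Gradio wiring that follows this comment is outside the diff. As a rough sketch of how analyze_audio is typically exposed (component names and labels here are assumptions, not the Space's actual code), an interface with a filepath audio input matches the string-path validation above:

    # Sketch only: illustrative wiring, not taken from this commit.
    import gradio as gr

    demo = gr.Interface(
        fn=analyze_audio,  # defined earlier in app.py
        inputs=[
            gr.Audio(type="filepath", label="Audio clip"),  # passes a file path string
            gr.Textbox(label="Question (optional)"),
        ],
        outputs=gr.Textbox(label="Model response"),
        title="Qwen-Audio-Chat demo",
    )

    if __name__ == "__main__":
        demo.launch()

With type="filepath", Gradio hands analyze_audio a path string (or None when no file is supplied), which is what the isinstance(audio_path, str) check expects; an empty question falls back to the default prompt via the query = question if question else ... line.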
|