akshatOP committed on
Commit 2f38e4a · 1 Parent(s): b76224d

Initial upload of TTS, SST, and LLM models with API

README.md CHANGED
@@ -10,3 +10,44 @@ pinned: false
  ---
  
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # My AI Models Space
+
+ This Hugging Face Space hosts TTS (text-to-speech), SST (speech-to-text), and LLM models behind simple API endpoints.
+
+ ## Setup
+
+ 1. **Clone the repository** to your Hugging Face Space.
+ 2. **Install dependencies**: `pip install -r requirements.txt`.
+ 3. **Prepare models**:
+    - **TTS**: Run `download_and_finetune_tts.py` externally, then upload `./tts_finetuned` to `models/tts_model`. If nothing is uploaded, the app falls back to `parler-tts/parler-tts-mini-v1`.
+    - **SST**: Run `download_and_finetune_sst.py` externally, then upload `./sst_finetuned` to `models/sst_model`. If nothing is uploaded, the app falls back to `facebook/wav2vec2-base-960h`.
+    - **LLM**: Download a Llama GGUF file (e.g., from `TheBloke/Llama-2-7B-GGUF` on the Hugging Face Hub) and upload it to `models/llama.gguf`. This file is required for the LLM endpoint to work (see the download sketch after this list).
+ 4. **Deploy**: Push to your Space, and it will run `app.py`.
+
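+ The snippet below is a minimal sketch of the LLM part of step 3: it fetches a quantized GGUF from the Hub with `huggingface_hub.hf_hub_download` and copies it to the path `app.py` expects. The repo and filename shown are examples; use any GGUF you prefer.
+
+ ```python
+ import os
+ import shutil
+ from huggingface_hub import hf_hub_download
+
+ # Download a quantized Llama 2 GGUF (example repo/filename).
+ gguf_path = hf_hub_download(
+     repo_id="TheBloke/Llama-2-7B-GGUF",
+     filename="llama-2-7b.Q4_K_M.gguf",
+ )
+
+ # Place it where app.py looks for it.
+ os.makedirs("models", exist_ok=True)
+ shutil.copy(gguf_path, "models/llama.gguf")
+ ```
+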
+ ## API Endpoints
+
+ - **POST /tts**
+   - **Request**: `{"text": "Your text here"}`
+   - **Response**: Audio file (WAV)
+   - **Example**: `curl -X POST -H "Content-Type: application/json" -d '{"text":"Hello"}' https://your-space.hf.space/tts --output output.wav`
+
+ - **POST /sst**
+   - **Request**: Audio file upload (multipart form field `file`)
+   - **Response**: `{"text": "transcribed text"}`
+   - **Example**: `curl -X POST -F "file=@audio.wav" https://your-space.hf.space/sst`
+
+ - **POST /llm**
+   - **Request**: `{"prompt": "Your prompt here"}`
+   - **Response**: `{"text": "generated text"}`
+   - **Example**: `curl -X POST -H "Content-Type: application/json" -d '{"prompt":"Tell me a story"}' https://your-space.hf.space/llm`
+
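+ A minimal Python client sketch for the three endpoints, assuming the placeholder Space URL above and the `requests` library:
+
+ ```python
+ import requests
+
+ BASE = "https://your-space.hf.space"  # replace with your Space URL
+
+ # TTS: JSON in, WAV bytes out
+ r = requests.post(f"{BASE}/tts", json={"text": "Hello"})
+ with open("output.wav", "wb") as f:
+     f.write(r.content)
+
+ # SST: multipart file upload in, JSON out
+ with open("output.wav", "rb") as f:
+     r = requests.post(f"{BASE}/sst", files={"file": f})
+ print(r.json()["text"])
+
+ # LLM: JSON in, JSON out
+ r = requests.post(f"{BASE}/llm", json={"prompt": "Tell me a story"})
+ print(r.json()["text"])
+ ```
+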
+ ## Fine-Tuning
+
+ - **TTS**: Edit `download_and_finetune_tts.py` with your dataset, run it externally, and upload the result (see the upload sketch after this list).
+ - **SST**: Edit `download_and_finetune_sst.py` with your dataset, run it externally, and upload the result.
+ - **LLM**: llama.cpp is used for inference only. For fine-tuning, use tools such as LoRA with Transformers externally, convert the result to GGUF, and upload it.
+
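+ A sketch of the upload step using `huggingface_hub`, assuming a Space repo id of `username/your-space` (a placeholder); adjust the paths for the SST model accordingly:
+
+ ```python
+ from huggingface_hub import HfApi
+
+ api = HfApi()
+ # Push the locally fine-tuned TTS folder into the Space's models/ directory.
+ api.upload_folder(
+     repo_id="username/your-space",
+     repo_type="space",
+     folder_path="./tts_finetuned",
+     path_in_repo="models/tts_model",
+ )
+ ```
+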
+ ## Notes
+
+ - Keep the GGUF file for the LLM to a manageable size (e.g., a quantized variant such as `llama-2-7b.Q4_K_M.gguf`).
+ - Fine-tuning requires significant resources; perform it outside Spaces.
app.py ADDED
@@ -0,0 +1,74 @@
+ from fastapi import FastAPI, File, UploadFile, Response
+ # ParlerTTSForConditionalGeneration ships in the parler-tts package, not in transformers.
+ from parler_tts import ParlerTTSForConditionalGeneration
+ from transformers import AutoTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
+ from llama_cpp import Llama
+ import torch
+ import soundfile as sf
+ import io
+ import os
+ from pydantic import BaseModel
+
+ app = FastAPI()
+
+ # Load models
+ # TTS: use the local fine-tuned model if available, else load from the Hub
+ if os.path.exists("./models/tts_model"):
+     tts_model = ParlerTTSForConditionalGeneration.from_pretrained("./models/tts_model")
+     tts_tokenizer = AutoTokenizer.from_pretrained("./models/tts_model")
+ else:
+     tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1")
+     tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
+
+ # SST: use the local fine-tuned model if available, else load from the Hub
+ if os.path.exists("./models/sst_model"):
+     sst_model = Wav2Vec2ForCTC.from_pretrained("./models/sst_model")
+     sst_processor = Wav2Vec2Processor.from_pretrained("./models/sst_model")
+ else:
+     sst_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+     sst_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+
+ # LLM: a local GGUF file is required (must be uploaded to models/llama.gguf)
+ if os.path.exists("./models/llama.gguf"):
+     llm = Llama(model_path="./models/llama.gguf")
+ else:
+     raise FileNotFoundError("Please upload llama.gguf to the models/ directory")
+
+ # Request models
+ class TTSRequest(BaseModel):
+     text: str
+
+ class LLMRequest(BaseModel):
+     prompt: str
+
+ # API Endpoints
+ @app.post("/tts")
+ async def tts_endpoint(request: TTSRequest):
+     """Convert text to speech and return audio."""
+     # Parler-TTS conditions generation on a voice description plus the prompt text.
+     description = "A clear, neutral speaker with a moderate pace."  # example description; customize as needed
+     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids
+     prompt_input_ids = tts_tokenizer(request.text, return_tensors="pt").input_ids
+     with torch.no_grad():
+         audio = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+     audio = audio.squeeze().cpu().numpy()
+     buffer = io.BytesIO()
+     # Use the model's configured sampling rate rather than a hard-coded value.
+     sf.write(buffer, audio, tts_model.config.sampling_rate, format="WAV")
+     buffer.seek(0)
+     return Response(content=buffer.getvalue(), media_type="audio/wav")
+
+ @app.post("/sst")
+ async def sst_endpoint(file: UploadFile = File(...)):
+     """Convert speech to text and return transcription."""
+     audio_bytes = await file.read()
+     audio, sr = sf.read(io.BytesIO(audio_bytes))
+     if audio.ndim > 1:
+         audio = audio.mean(axis=1)  # down-mix stereo to mono
+     # wav2vec2-base-960h expects 16 kHz audio; resample uploads beforehand if needed.
+     inputs = sst_processor(audio, sampling_rate=sr, return_tensors="pt")
+     with torch.no_grad():
+         logits = sst_model(inputs.input_values).logits
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = sst_processor.batch_decode(predicted_ids)[0]
+     return {"text": transcription}
+
+ @app.post("/llm")
+ async def llm_endpoint(request: LLMRequest):
+     """Generate text from a prompt using llama.cpp."""
+     output = llm(request.prompt, max_tokens=50)
+     return {"text": output["choices"][0]["text"]}
download_and_finetune_sst.py ADDED
File without changes
download_and_finetune_tts.py ADDED
File without changes
models/sst_model/download_and_finetune_sst.py ADDED
@@ -0,0 +1,48 @@
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
+ from datasets import load_dataset
+
+ # Download model
+ model_name = "facebook/wav2vec2-base-960h"
+ model = Wav2Vec2ForCTC.from_pretrained(model_name)
+ processor = Wav2Vec2Processor.from_pretrained(model_name)
+
+ # Load dataset (replace with your dataset)
+ dataset = load_dataset("librispeech_asr", "clean", split="train.100")  # Example dataset
+
+ # Preprocess function: turn each example into input_values plus CTC labels
+ def preprocess_function(examples):
+     audio = examples["audio"]
+     inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt", padding=True)
+     # Note: as_target_processor() is deprecated in recent transformers releases.
+     with processor.as_target_processor():
+         labels = processor(examples["text"], return_tensors="pt", padding=True)
+     return {
+         "input_values": inputs["input_values"][0],
+         "labels": labels["input_ids"][0],
+     }
+
+ train_dataset = dataset.map(preprocess_function, remove_columns=dataset.column_names)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     output_dir="./sst_finetuned",
+     per_device_train_batch_size=8,
+     num_train_epochs=3,
+     save_steps=500,
+     logging_steps=10,
+ )
+
+ # Initialize Trainer
+ # Note: examples have variable lengths, so a padding data collator (e.g., a custom CTC collator)
+ # is needed for real training runs with batch sizes > 1.
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+ )
+
+ # Fine-tune
+ trainer.train()
+
+ # Save fine-tuned model
+ trainer.save_model("./sst_finetuned")
+ processor.save_pretrained("./sst_finetuned")
+
+ print("SST model fine-tuned and saved to './sst_finetuned'. Upload to models/sst_model in your Space.")
models/tts_model/download_and_finetune_tts.py ADDED
@@ -0,0 +1,44 @@
+ # ParlerTTSForConditionalGeneration ships in the parler-tts package, not in transformers.
+ from parler_tts import ParlerTTSForConditionalGeneration
+ from transformers import AutoTokenizer, Trainer, TrainingArguments
+ from datasets import load_dataset
+
+ # Download model
+ model_name = "parler-tts/parler-tts-mini-v1"
+ model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # Load dataset (replace with your dataset)
+ dataset = load_dataset("lj_speech")  # Example dataset; adjust as needed
+
+ # Preprocess function (placeholder: customize for your dataset)
+ def preprocess_function(examples):
+     # Tokenize the text; a real TTS fine-tune also needs the target audio features.
+     inputs = tokenizer(examples["text"], return_tensors="pt", padding=True, truncation=True)
+     # Add audio processing here if needed
+     return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]}
+
+ train_dataset = dataset["train"].map(preprocess_function, batched=True)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     output_dir="./tts_finetuned",
+     per_device_train_batch_size=8,
+     num_train_epochs=3,
+     save_steps=500,
+     logging_steps=10,
+ )
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+ )
+
+ # Fine-tune
+ trainer.train()
+
+ # Save fine-tuned model
+ trainer.save_model("./tts_finetuned")
+ tokenizer.save_pretrained("./tts_finetuned")
+
+ print("TTS model fine-tuned and saved to './tts_finetuned'. Upload to models/tts_model in your Space.")
nuera/models/llama.gguf ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ fastapi
+ uvicorn
+ transformers
+ torch
+ soundfile
+ numpy
+ llama-cpp-python
+ pydantic
+ datasets
+ # Parler-TTS is a separate package (not part of transformers); install it from GitHub
+ git+https://github.com/huggingface/parler-tts.git