initial
This commit is contained in:
26
.gitignore
vendored
Normal file
26
.gitignore
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
# Python environment
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
venv/
|
||||
.venv/
|
||||
env/
|
||||
.env
|
||||
|
||||
# Data and artifacts
|
||||
*.wav
|
||||
*.mp3
|
||||
*.log
|
||||
/tmp/
|
||||
|
||||
# Model and other Large Files
|
||||
*.bin
|
||||
*.pt
|
||||
*.pth
|
||||
*.h5
|
||||
*.onnx
|
||||
|
||||
# IDEs
|
||||
.vscode/
|
||||
.idea/
|
||||
.DS_Store
|
||||
31
Dockerfile
Normal file
31
Dockerfile
Normal file
@@ -0,0 +1,31 @@
|
||||
# Use NVIDIA CUDA 12.6 image for compatibility
FROM nvidia/cuda:12.6.2-devel-ubuntu22.04

# Set working directory
WORKDIR /app

# Install system dependencies
# ffmpeg and libsndfile1 are required for audio decoding/encoding (soundfile);
# git is needed to pip-install the dia package from GitHub (see requirements.txt)
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    git \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer is cached across code changes
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy application files
COPY . .

# Set environment variables (both are read by main.py at startup)
ENV MODEL_ID="nari-labs/Dia-1.6B"
ENV PORT=8000

# Expose API port
EXPOSE 8000

# Run the FastAPI server
CMD ["python3", "main.py"]
|
||||
87
README.md
Normal file
87
README.md
Normal file
@@ -0,0 +1,87 @@
|
||||
# Dia-1.6B API Server
|
||||
|
||||
API server for [nari-labs/Dia-1.6B](https://huggingface.co/nari-labs/Dia-1.6B), a 1.6 billion-parameter text-to-speech (TTS) model designed for realistic dialogue generation.
|
||||
|
||||
## Features
|
||||
- 🗣️ **Realistic Dialogue**: Directly generates natural-sounding conversations from transcripts.
|
||||
- 🎭 **Emotion and Tone**: Supports non-verbal cues like `(laughs)`, `(coughs)`, and `(clears throat)`.
|
||||
- 👥 **Multi-Speaker Support**: Uses tags like `[S1]` and `[S2]` to alternate between speakers.
|
||||
- 🎙️ **Audio Prompting**: Supports voice conditioning and cloning via audio prompts.
|
||||
- 🚀 **FastAPI Implementation**: High-performance, documented API endpoints.
|
||||
|
||||
## Prerequisites
|
||||
- **Python 3.9+**
|
||||
- **NVIDIA GPU (Recommended)**: 10GB+ VRAM for optimal performance.
|
||||
- **CUDA 12.6+** (required for GPU inference; the server falls back to CPU if CUDA is unavailable).
|
||||
|
||||
## Installation
|
||||
|
||||
1. **Clone the repository and navigate into the folder:**
|
||||
```bash
|
||||
git clone <repo-url>
|
||||
cd dia-api-server
|
||||
```
|
||||
|
||||
2. **Create a virtual environment:**
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
||||
```
|
||||
|
||||
3. **Install dependencies:**
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Running the Server
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
The server will be available at `http://localhost:8000`.
|
||||
|
||||
### API Documentation
|
||||
Once the server is running, you can access the interactive documentation at:
|
||||
- Swagger UI: `http://localhost:8000/docs`
|
||||
- Redoc: `http://localhost:8000/redoc`
|
||||
|
||||
### Example Endpoint: `/generate` (POST)
|
||||
**Parameters:**
|
||||
- `text` (Form data): The transcript including speaker tags.
|
||||
- `audio_prompt` (Form file, optional): An audio file to condition the generation.
|
||||
|
||||
**Response:**
|
||||
Returns a `StreamingResponse` as an `audio/wav` binary stream.
|
||||
|
||||
### Test Script
|
||||
You can use `test_api.py` to verify the server:
|
||||
```bash
|
||||
python test_api.py
|
||||
```
|
||||
|
||||
## Docker Deployment (Recommended)
|
||||
Developing and running locally may be complicated due to CUDA requirements. Here is a sample `Dockerfile` for deployment:
|
||||
|
||||
```dockerfile
|
||||
FROM nvidia/cuda:12.6.2-devel-ubuntu22.04
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && apt-get install -y \
|
||||
python3 \
|
||||
python3-pip \
|
||||
git \
|
||||
ffmpeg \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip3 install -r requirements.txt
|
||||
|
||||
COPY . .
|
||||
|
||||
CMD ["python3", "main.py"]
|
||||
```
|
||||
|
||||
## License
|
||||
Refer to the [nari-labs/Dia-1.6B](https://huggingface.co/nari-labs/Dia-1.6B#🪪-license) license on Hugging Face.
|
||||
104
main.py
Normal file
104
main.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import io
import os
import tempfile
from typing import Optional

import soundfile as sf
import torch
import uvicorn
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
|
||||
|
||||
# We try to import Dia. In a real environment, this would be installed via requirements.txt
try:
    from dia.model import Dia
except ImportError:
    # Fallback for development if not installed: the startup hook skips
    # loading and /generate answers 503 instead of crashing at import time.
    Dia = None
|
||||
|
||||
# FastAPI application; interactive docs are served at /docs and /redoc.
app = FastAPI(
    title="Dia-1.6B API Server",
    description="API server for Nari Labs Dia-1.6B TTS model. Supports realistic dialogue generation, speaker tags, and audio prompting.",
    version="1.0.0"
)

# Global model instance — None until the startup hook loads it successfully.
model = None
|
||||
|
||||
@app.on_event("startup")
async def load_model():
    """Load the Dia TTS model once when the server starts.

    Leaves the global ``model`` as ``None`` when the ``dia`` library is
    missing or loading fails; /generate then responds with 503.
    """
    global model

    # Nothing to do when the dia package could not be imported.
    if Dia is None:
        print("Warning: 'dia' library not found. Model will not be loaded.")
        return

    model_id = os.getenv("MODEL_ID", "nari-labs/Dia-1.6B")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Loading model {model_id} on {device}...")

    try:
        model = Dia.from_pretrained(model_id)
        # Only move to GPU when one is actually available.
        if device == "cuda":
            model = model.to(device)
        print("Model loaded successfully.")
    except Exception as e:
        # Startup is best-effort: log and keep serving (health endpoint
        # will report model_loaded=False).
        print(f"Error loading model: {e}")
|
||||
|
||||
@app.post("/generate", summary="Generate audio from text")
async def generate(
    text: str = Form(..., description="The transcript text to generate audio for. Use speaker tags like [S1], [S2]."),
    audio_prompt: Optional[UploadFile] = File(None, description="Optional audio file for voice cloning/conditioning.")
):
    """
    Generate realistic dialogue audio from a transcript.

    Supports speaker tags [S1], [S2], and non-verbal cues like (laughs).

    Returns a streaming ``audio/wav`` response. Raises HTTP 503 when the
    model is not loaded and HTTP 500 when generation fails.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded or 'dia' library missing.")

    prompt_path = None
    try:
        if audio_prompt:
            # Persist the uploaded prompt to a unique temp file.
            # NamedTemporaryFile (instead of f"/tmp/prompt_{filename}")
            # avoids path traversal via a crafted client filename and
            # collisions between concurrent requests.
            suffix = os.path.splitext(audio_prompt.filename or "")[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(await audio_prompt.read())
                prompt_path = tmp.name

        # Generate audio using the model.
        # According to documentation, generate returns a numpy array.
        # Signature: model.generate(text, audio_prompt=None, ...)
        output = model.generate(text, audio_prompt=prompt_path)

        # Serialize the waveform to WAV in memory.
        # NOTE(review): 44100 Hz assumed to be Dia's output rate — confirm.
        output_buffer = io.BytesIO()
        sf.write(output_buffer, output, 44100, format='WAV')
        output_buffer.seek(0)

        return StreamingResponse(
            output_buffer,
            media_type="audio/wav",
            headers={"Content-Disposition": "attachment; filename=output.wav"}
        )

    except Exception as e:
        print(f"Generation error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Clean up the temp prompt on every path — the original leaked the
        # file whenever generation raised.
        if prompt_path and os.path.exists(prompt_path):
            os.remove(prompt_path)
|
||||
|
||||
@app.get("/health", summary="Health check")
def health():
    """Liveness probe: reports whether the model is loaded and which device is active."""
    has_cuda = torch.cuda.is_available()
    return {
        "status": "healthy",
        "model_loaded": model is not None,
        "device": "cuda" if has_cuda else "cpu",
    }
|
||||
|
||||
if __name__ == "__main__":
    # Port is configurable via the PORT env var (defaults to 8000, see Dockerfile).
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))
|
||||
8
requirements.txt
Normal file
8
requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
torch>=2.0
|
||||
transformers
|
||||
soundfile
|
||||
pydantic
|
||||
python-multipart
|
||||
git+https://github.com/nari-labs/dia.git
|
||||
56
test_api.py
Normal file
56
test_api.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
# API endpoint — generation route of the locally running server (see main.py).
URL = "http://localhost:8000/generate"
|
||||
|
||||
def test_generation(text: str, output_file: str = "output.wav", audio_prompt: str = None):
    """
    Test the Dia API generation endpoint.

    Args:
        text (str): Transcript text with speaker tags.
        output_file (str): Filename to save the output audio.
        audio_prompt (str): Optional path to an audio prompt file.
    """
    # Using 'Form' and 'File' parameters in requests
    data = {"text": text}
    files = {}

    if audio_prompt and Path(audio_prompt).exists():
        files["audio_prompt"] = open(audio_prompt, "rb")

    print(f"Calling generation endpoint with text: '{text[:100]}...'")

    # Initialize so the except branch can safely inspect it: the original
    # code hit an UnboundLocalError (masking the real error) whenever
    # requests.post() itself raised, e.g. on a connection failure.
    response = None
    try:
        # Use POST request with timeout
        response = requests.post(URL, data=data, files=files, timeout=60)
        response.raise_for_status()

        # Save the audio output; 'with' guarantees the file is closed.
        with open(output_file, "wb") as f:
            f.write(response.content)

        print(f"Successfully generated audio. Saved as '{output_file}'")

    except requests.exceptions.RequestException as e:
        print(f"Error calling API: {e}")
        if response is not None and response.content:
            print(f"Server error details: {response.text}")
    finally:
        # Always release the uploaded prompt file handle.
        if files:
            files["audio_prompt"].close()
|
||||
|
||||
if __name__ == "__main__":
    # Sample transcript exercising the model's dialogue features:
    # speaker tags ([S1]/[S2]) and a non-verbal cue ("(laughs)").
    SAMPLE_TEXT = (
        "[S1] Dia is an open weights text to dialogue model. (laughs) "
        "[S2] It allows full control over scripts and voices. "
        "[S1] Wow. Really impressive."
    )

    # To condition generation on a reference voice, pass an audio prompt:
    # test_generation(SAMPLE_TEXT, audio_prompt="path/to/my_voice.wav")

    print("Testing basic generation...")
    test_generation(SAMPLE_TEXT)
|
||||
Reference in New Issue
Block a user