Add: VibeVoice ASR セットアップスクリプト一式
All checks were successful
Deploy Docusaurus Site / deploy (push) Successful in 27s
All checks were successful
Deploy Docusaurus Site / deploy (push) Successful in 27s
This commit is contained in:
parent
2d753f114f
commit
1fb76254e9
61
static/scripts/vibevoice-asr/Dockerfile
Normal file
61
static/scripts/vibevoice-asr/Dockerfile
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
# VibeVoice-ASR for DGX Spark (ARM64, Blackwell GB10, sm_121)
# Based on NVIDIA PyTorch container for CUDA 13.1 compatibility

# NOTE(review): declared before FROM so it is available to the FROM line's
# scope, but it is not referenced anywhere below — presumably kept for
# buildx multi-arch builds; confirm before removing.
ARG TARGETARCH
FROM nvcr.io/nvidia/pytorch:25.11-py3 AS base

LABEL maintainer="VibeVoice-ASR DGX Spark Setup"
LABEL description="VibeVoice-ASR optimized for DGX Spark (ARM64, CUDA 13.1)"

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# PyTorch CUDA settings for DGX Spark
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
ENV USE_LIBUV=0

# Set working directory
WORKDIR /workspace

# Install system dependencies including FFmpeg for demo
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    git \
    curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install flash-attn if not already present.
# "|| true" deliberately makes this best-effort: the base image may already
# ship a compatible flash-attn, and a source build can fail on this arch.
RUN pip install --no-cache-dir flash-attn --no-build-isolation || true

# Clone and install VibeVoice
RUN git clone https://github.com/microsoft/VibeVoice.git /workspace/VibeVoice && \
    cd /workspace/VibeVoice && \
    pip install --no-cache-dir -e .

# Create test script and patched demo with MKV support
# (the patched demo overwrites the upstream Gradio demo in place)
COPY test_vibevoice.py /workspace/test_vibevoice.py
COPY vibevoice_asr_gradio_demo_patched.py /workspace/VibeVoice/demo/vibevoice_asr_gradio_demo.py

# Install real-time ASR dependencies
COPY requirements-realtime.txt /workspace/requirements-realtime.txt
RUN pip install --no-cache-dir -r /workspace/requirements-realtime.txt

# Copy real-time ASR module and startup scripts
COPY realtime/ /workspace/VibeVoice/realtime/
COPY static/ /workspace/VibeVoice/static/
COPY run_all.sh /workspace/VibeVoice/run_all.sh
COPY run_realtime.sh /workspace/VibeVoice/run_realtime.sh
RUN chmod +x /workspace/VibeVoice/run_all.sh /workspace/VibeVoice/run_realtime.sh

# Set default working directory to VibeVoice
WORKDIR /workspace/VibeVoice

# Expose Gradio port and WebSocket port
EXPOSE 7860
EXPOSE 8000

# Default command: Launch Gradio demo with MKV support
CMD ["python", "demo/vibevoice_asr_gradio_demo.py", "--model_path", "microsoft/VibeVoice-ASR", "--host", "0.0.0.0"]
|
||||||
7
static/scripts/vibevoice-asr/realtime/__init__.py
Normal file
7
static/scripts/vibevoice-asr/realtime/__init__.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
"""
VibeVoice Realtime ASR Module

WebSocket-based real-time speech recognition using VibeVoice ASR.
"""

# Package version; bump on release.
__version__ = "0.1.0"
|
||||||
358
static/scripts/vibevoice-asr/realtime/asr_worker.py
Normal file
358
static/scripts/vibevoice-asr/realtime/asr_worker.py
Normal file
@ -0,0 +1,358 @@
|
|||||||
|
"""
|
||||||
|
ASR Worker for real-time transcription.
|
||||||
|
|
||||||
|
Wraps the existing VibeVoiceASRInference for async/streaming operation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import queue
|
||||||
|
from typing import AsyncGenerator, Optional, List, Callable
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
|
||||||
|
# Add parent directory and demo directory to path for importing existing code
|
||||||
|
_parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
sys.path.insert(0, _parent_dir)
|
||||||
|
sys.path.insert(0, os.path.join(_parent_dir, "demo"))
|
||||||
|
|
||||||
|
from .models import (
|
||||||
|
TranscriptionResult,
|
||||||
|
TranscriptionSegment,
|
||||||
|
MessageType,
|
||||||
|
SessionConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class InferenceRequest:
    """Request for ASR inference.

    Bundles one audio segment with the metadata needed to place the
    transcription on the session timeline.
    """
    # Mono audio samples (float32 expected by the worker's temp-file writer).
    audio: np.ndarray
    # Sample rate of `audio` in Hz.
    sample_rate: int
    # Optional free-text context passed through to the ASR model.
    context_info: Optional[str]
    # time.time() when the request was created; used for latency reporting.
    request_time: float
    # Segment start, seconds since session start.
    segment_start_sec: float
    # Segment end, seconds since session start.
    segment_end_sec: float
|
||||||
|
|
||||||
|
|
||||||
|
class ASRWorker:
    """
    ASR Worker that wraps VibeVoiceASRInference for real-time use.

    Features:
    - Async interface for WebSocket integration
    - Streaming output via TextIteratorStreamer
    - Request queuing for handling concurrent segments
    - Graceful model loading and error handling
    """

    def __init__(
        self,
        model_path: str = "microsoft/VibeVoice-ASR",
        device: str = "cuda",
        dtype: torch.dtype = torch.bfloat16,
        attn_implementation: str = "flash_attention_2",
    ):
        """
        Initialize the ASR worker.

        Args:
            model_path: Path to VibeVoice ASR model
            device: Device to run inference on
            dtype: Model data type
            attn_implementation: Attention implementation
        """
        self.model_path = model_path
        self.device = device
        self.dtype = dtype
        self.attn_implementation = attn_implementation

        # Lazily-created VibeVoiceASRInference instance (None until load_model).
        self._inference = None
        self._is_loaded = False
        # Guards load_model() so concurrent callers load the model only once.
        self._load_lock = threading.Lock()

        # Inference queue for serializing requests: at most one GPU inference
        # in flight at a time.
        self._inference_semaphore = asyncio.Semaphore(1)

    def load_model(self) -> bool:
        """
        Load the ASR model.

        Returns:
            True if model loaded successfully
        """
        with self._load_lock:
            if self._is_loaded:
                return True

            try:
                # Import here to avoid circular imports and allow lazy loading
                # In Docker, the file is copied as vibevoice_asr_gradio_demo.py
                try:
                    from vibevoice_asr_gradio_demo import VibeVoiceASRInference
                except ImportError:
                    from vibevoice_asr_gradio_demo_patched import VibeVoiceASRInference

                print(f"Loading VibeVoice ASR model from {self.model_path}...")
                self._inference = VibeVoiceASRInference(
                    model_path=self.model_path,
                    device=self.device,
                    dtype=self.dtype,
                    attn_implementation=self.attn_implementation,
                )
                self._is_loaded = True
                print("ASR model loaded successfully")
                return True

            except Exception as e:
                # Load failures are reported, not raised: callers get False and
                # transcribe_segment() returns an ERROR result instead.
                print(f"Failed to load ASR model: {e}")
                import traceback
                traceback.print_exc()
                return False

    @property
    def is_loaded(self) -> bool:
        """Check if model is loaded."""
        return self._is_loaded

    async def transcribe_segment(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        context_info: Optional[str] = None,
        segment_start_sec: float = 0.0,
        segment_end_sec: float = 0.0,
        config: Optional[SessionConfig] = None,
        on_partial: Optional[Callable[[TranscriptionResult], None]] = None,
    ) -> TranscriptionResult:
        """
        Transcribe an audio segment asynchronously.

        Args:
            audio: Audio data as float32 array
            sample_rate: Audio sample rate
            context_info: Optional context for transcription
            segment_start_sec: Start time of segment in session
            segment_end_sec: End time of segment in session
            config: Session configuration
            on_partial: Callback for partial results

        Returns:
            Final transcription result
        """
        if not self._is_loaded:
            if not self.load_model():
                # Model could not be loaded; surface an ERROR-typed result
                # rather than raising across the WebSocket boundary.
                return TranscriptionResult(
                    type=MessageType.ERROR,
                    text="",
                    is_final=True,
                    latency_ms=0,
                )

        config = config or SessionConfig()
        request_time = time.time()

        # Serialize inference requests (one segment on the GPU at a time).
        async with self._inference_semaphore:
            return await self._run_inference(
                audio=audio,
                sample_rate=sample_rate,
                context_info=context_info,
                segment_start_sec=segment_start_sec,
                segment_end_sec=segment_end_sec,
                config=config,
                request_time=request_time,
                on_partial=on_partial,
            )

    async def _run_inference(
        self,
        audio: np.ndarray,
        sample_rate: int,
        context_info: Optional[str],
        segment_start_sec: float,
        segment_end_sec: float,
        config: SessionConfig,
        request_time: float,
        on_partial: Optional[Callable[[TranscriptionResult], None]],
    ) -> TranscriptionResult:
        """Run the actual inference in a thread pool."""
        from transformers import TextIteratorStreamer

        # Create streamer for partial results
        streamer = None
        if config.return_partial_results and on_partial:
            streamer = TextIteratorStreamer(
                self._inference.processor.tokenizer,
                skip_prompt=True,
                skip_special_tokens=True,
            )

        # Result container for thread (closed over by run_inference below).
        result_container = {"result": None, "error": None}

        def run_inference():
            try:
                # Save audio to temp file (required by current implementation)
                import tempfile
                import soundfile as sf

                # delete=False: the file must outlive this `with` so
                # transcribe() can read it; we unlink it ourselves below.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                    temp_path = f.name

                # Write audio (float32 [-1, 1] -> int16 PCM, with clipping).
                audio_int16 = (audio * 32768.0).clip(-32768, 32767).astype(np.int16)
                sf.write(temp_path, audio_int16, sample_rate, subtype='PCM_16')

                try:
                    result = self._inference.transcribe(
                        audio_path=temp_path,
                        max_new_tokens=config.max_new_tokens,
                        temperature=config.temperature,
                        context_info=context_info,
                        streamer=streamer,
                    )
                    result_container["result"] = result
                finally:
                    # Clean up temp file
                    # NOTE(review): bare except also swallows KeyboardInterrupt;
                    # `except OSError:` would be safer here.
                    try:
                        os.unlink(temp_path)
                    except:
                        pass

            except Exception as e:
                result_container["error"] = str(e)
                import traceback
                traceback.print_exc()

        # Start inference in background thread
        inference_thread = threading.Thread(target=run_inference)
        inference_thread.start()

        # Stream partial results if enabled.
        # NOTE(review): iterating `streamer` blocks this coroutine (and the
        # event loop) between tokens; acceptable only because inference is
        # serialized — confirm if other coroutines must stay responsive.
        partial_text = ""
        if streamer and on_partial:
            try:
                for new_text in streamer:
                    partial_text += new_text
                    partial_result = TranscriptionResult(
                        type=MessageType.PARTIAL_RESULT,
                        text=partial_text,
                        is_final=False,
                        latency_ms=(time.time() - request_time) * 1000,
                    )
                    # Call callback (may be async)
                    if asyncio.iscoroutinefunction(on_partial):
                        await on_partial(partial_result)
                    else:
                        on_partial(partial_result)
            except Exception as e:
                print(f"Error during streaming: {e}")

        # Wait for completion.
        # NOTE(review): join() blocks the event loop when streaming is off;
        # presumably fine for this single-worker setup — verify under load.
        inference_thread.join()

        latency_ms = (time.time() - request_time) * 1000

        if result_container["error"]:
            return TranscriptionResult(
                type=MessageType.ERROR,
                text=f"Error: {result_container['error']}",
                is_final=True,
                latency_ms=latency_ms,
            )

        result = result_container["result"]

        # Convert segments to our format
        segments = []
        for seg in result.get("segments", []):
            # Adjust timestamps relative to session
            seg_start = seg.get("start_time", 0)
            seg_end = seg.get("end_time", 0)

            # If segment has relative timestamps, adjust to absolute
            if isinstance(seg_start, (int, float)) and isinstance(seg_end, (int, float)):
                adjusted_start = segment_start_sec + seg_start
                adjusted_end = segment_start_sec + seg_end
            else:
                # Non-numeric timestamps: fall back to the whole segment span.
                adjusted_start = segment_start_sec
                adjusted_end = segment_end_sec

            segments.append(TranscriptionSegment(
                start_time=adjusted_start,
                end_time=adjusted_end,
                speaker_id=seg.get("speaker_id", "SPEAKER_00"),
                text=seg.get("text", ""),
            ))

        return TranscriptionResult(
            type=MessageType.FINAL_RESULT,
            text=result.get("raw_text", ""),
            is_final=True,
            segments=segments,
            latency_ms=latency_ms,
        )

    async def transcribe_stream(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        context_info: Optional[str] = None,
        segment_start_sec: float = 0.0,
        segment_end_sec: float = 0.0,
        config: Optional[SessionConfig] = None,
    ) -> AsyncGenerator[TranscriptionResult, None]:
        """
        Transcribe an audio segment with streaming output.

        Yields partial results followed by final result.

        Args:
            audio: Audio data
            sample_rate: Sample rate
            context_info: Optional context
            segment_start_sec: Segment start time
            segment_end_sec: Segment end time
            config: Session config

        Yields:
            TranscriptionResult objects (partial and final)
        """
        result_queue: asyncio.Queue = asyncio.Queue()

        async def on_partial(result: TranscriptionResult):
            await result_queue.put(result)

        # Start transcription task
        transcribe_task = asyncio.create_task(
            self.transcribe_segment(
                audio=audio,
                sample_rate=sample_rate,
                context_info=context_info,
                segment_start_sec=segment_start_sec,
                segment_end_sec=segment_end_sec,
                config=config,
                on_partial=on_partial,
            )
        )

        # Yield partial results as they come (poll with a short timeout so we
        # also notice task completion promptly).
        while not transcribe_task.done():
            try:
                result = await asyncio.wait_for(result_queue.get(), timeout=0.1)
                yield result
            except asyncio.TimeoutError:
                continue

        # Drain any remaining partial results
        while not result_queue.empty():
            yield await result_queue.get()

        # Yield final result
        final_result = await transcribe_task
        yield final_result
|
||||||
246
static/scripts/vibevoice-asr/realtime/audio_buffer.py
Normal file
246
static/scripts/vibevoice-asr/realtime/audio_buffer.py
Normal file
@ -0,0 +1,246 @@
|
|||||||
|
"""
|
||||||
|
Audio buffer management for real-time ASR.
|
||||||
|
|
||||||
|
Implements a ring buffer for efficient audio chunk management with overlap support.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import threading
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AudioChunkInfo:
    """Information about an extracted audio chunk.

    Sample indices and times are absolute, i.e. relative to session start,
    not to the current buffer contents.
    """
    # The extracted samples (a copy, safe to use after the buffer advances).
    audio: np.ndarray
    # First sample index of this chunk since session start.
    start_sample: int
    # One past the last sample index since session start.
    end_sample: int
    # start_sample expressed in seconds.
    start_sec: float
    # end_sample expressed in seconds.
    end_sec: float
|
||||||
|
|
||||||
|
|
||||||
|
class AudioBuffer:
    """
    Ring buffer for managing audio chunks with overlap support.

    Features:
    - Efficient memory management with fixed-size buffer
    - Overlap handling for continuous processing
    - Thread-safe operations
    - Automatic sample rate tracking

    Invariants (all under `_lock`):
    - 0 <= _read_pos <= _write_pos <= max_buffer_size
    - buffer index i corresponds to absolute sample
      (_total_samples_received - _write_pos + i)
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        chunk_duration_sec: float = 3.0,
        overlap_sec: float = 0.5,
        max_buffer_sec: float = 60.0,
    ):
        """
        Initialize the audio buffer.

        Args:
            sample_rate: Audio sample rate in Hz
            chunk_duration_sec: Duration of each processing chunk
            overlap_sec: Overlap between consecutive chunks
            max_buffer_sec: Maximum buffer duration (older data will be discarded)
        """
        self.sample_rate = sample_rate
        self.chunk_size = int(chunk_duration_sec * sample_rate)
        self.overlap_size = int(overlap_sec * sample_rate)
        self.max_buffer_size = int(max_buffer_sec * sample_rate)

        # Main buffer (pre-allocated)
        self._buffer = np.zeros(self.max_buffer_size, dtype=np.float32)
        self._write_pos = 0  # Next position to write
        self._read_pos = 0  # Position of unprocessed data start
        self._total_samples_received = 0  # Total samples since session start

        self._lock = threading.Lock()

    @property
    def samples_available(self) -> int:
        """Number of unprocessed samples in buffer."""
        with self._lock:
            return self._write_pos - self._read_pos

    @property
    def duration_available_sec(self) -> float:
        """Duration of unprocessed audio in seconds."""
        return self.samples_available / self.sample_rate

    @property
    def total_duration_sec(self) -> float:
        """Total duration of audio received since session start."""
        return self._total_samples_received / self.sample_rate

    def append(self, audio_chunk: np.ndarray) -> int:
        """
        Append audio chunk to the buffer.

        Args:
            audio_chunk: Audio data as float32 array (range: -1.0 to 1.0)

        Returns:
            Number of samples actually appended
        """
        if audio_chunk.dtype != np.float32:
            audio_chunk = audio_chunk.astype(np.float32)

        # Ensure 1D
        if audio_chunk.ndim > 1:
            audio_chunk = audio_chunk.flatten()

        with self._lock:
            chunk_len = len(audio_chunk)

            # Check if we need to shift buffer (running out of space)
            if self._write_pos + chunk_len > self.max_buffer_size:
                self._compact_buffer()

            # Still not enough space? Discard old unprocessed data
            # by advancing _read_pos, then compact again to free room.
            if self._write_pos + chunk_len > self.max_buffer_size:
                overflow = (self._write_pos + chunk_len) - self.max_buffer_size
                self._read_pos = min(self._read_pos + overflow, self._write_pos)
                self._compact_buffer()

            # Write to buffer
            # NOTE(review): if chunk_len > max_buffer_size this slice
            # assignment would fail; presumably chunks are far smaller than
            # the 60 s buffer — confirm against the WebSocket chunk size.
            end_pos = self._write_pos + chunk_len
            self._buffer[self._write_pos:end_pos] = audio_chunk
            self._write_pos = end_pos
            self._total_samples_received += chunk_len

            return chunk_len

    def _compact_buffer(self) -> None:
        """Move unprocessed data to the beginning of the buffer.

        Caller must hold `_lock`.
        """
        if self._read_pos > 0:
            unprocessed_len = self._write_pos - self._read_pos
            if unprocessed_len > 0:
                self._buffer[:unprocessed_len] = self._buffer[self._read_pos:self._write_pos]
            self._write_pos = unprocessed_len
            self._read_pos = 0

    def get_chunk_for_inference(self, min_duration_sec: float = 0.5) -> Optional[AudioChunkInfo]:
        """
        Get the next chunk for ASR inference.

        Returns a chunk of audio when enough data is available.
        The chunk includes overlap from the previous chunk for context.

        Args:
            min_duration_sec: Minimum duration required to return a chunk

        Returns:
            AudioChunkInfo if enough data is available, None otherwise
        """
        min_samples = int(min_duration_sec * self.sample_rate)

        with self._lock:
            available = self._write_pos - self._read_pos

            if available < min_samples:
                return None

            # Calculate chunk boundaries (at most chunk_size samples).
            chunk_start = self._read_pos
            chunk_end = min(self._read_pos + self.chunk_size, self._write_pos)
            actual_chunk_size = chunk_end - chunk_start

            # Extract audio (copy so the caller is isolated from later writes).
            audio = self._buffer[chunk_start:chunk_end].copy()

            # Calculate timestamps based on total samples received:
            # buffer index i maps to absolute sample total - write_pos + i.
            base_sample = self._total_samples_received - (self._write_pos - chunk_start)
            start_sec = base_sample / self.sample_rate
            end_sec = (base_sample + actual_chunk_size) / self.sample_rate

            return AudioChunkInfo(
                audio=audio,
                start_sample=base_sample,
                end_sample=base_sample + actual_chunk_size,
                start_sec=start_sec,
                end_sec=end_sec,
            )

    def mark_processed(self, samples: int) -> None:
        """
        Mark samples as processed, advancing the read position.

        Keeps overlap_size samples for context in the next chunk.

        Args:
            samples: Number of samples that were processed
        """
        with self._lock:
            # Advance read position but keep overlap for context
            advance = max(0, samples - self.overlap_size)
            self._read_pos = min(self._read_pos + advance, self._write_pos)

    def get_segment(self, start_sec: float, end_sec: float) -> Optional[np.ndarray]:
        """
        Get a specific time segment from the buffer.

        Args:
            start_sec: Start time in seconds (relative to session start)
            end_sec: End time in seconds

        Returns:
            Audio segment if available, None otherwise (segment has already
            been discarded from the buffer, or lies in the future)
        """
        start_sample = int(start_sec * self.sample_rate)
        end_sample = int(end_sec * self.sample_rate)

        with self._lock:
            # Calculate buffer positions: buffer[0] holds absolute sample
            # (total_received - write_pos).
            buffer_start_sample = self._total_samples_received - self._write_pos
            buffer_end_sample = self._total_samples_received

            # Check if segment is in buffer
            if start_sample < buffer_start_sample or end_sample > buffer_end_sample:
                return None

            # Convert to buffer indices
            buf_start = start_sample - buffer_start_sample
            buf_end = end_sample - buffer_start_sample

            return self._buffer[buf_start:buf_end].copy()

    def get_all_unprocessed(self) -> Optional[AudioChunkInfo]:
        """
        Get all unprocessed audio.

        Returns:
            AudioChunkInfo with all unprocessed audio, or None if empty
        """
        with self._lock:
            if self._write_pos <= self._read_pos:
                return None

            audio = self._buffer[self._read_pos:self._write_pos].copy()
            base_sample = self._total_samples_received - (self._write_pos - self._read_pos)
            start_sec = base_sample / self.sample_rate
            end_sec = self._total_samples_received / self.sample_rate

            return AudioChunkInfo(
                audio=audio,
                start_sample=base_sample,
                end_sample=self._total_samples_received,
                start_sec=start_sec,
                end_sec=end_sec,
            )

    def clear(self) -> None:
        """Clear the buffer and reset all positions."""
        with self._lock:
            self._buffer.fill(0)
            self._write_pos = 0
            self._read_pos = 0
            self._total_samples_received = 0

    def reset_read_position(self) -> None:
        """Reset read position to current write position (skip all unprocessed)."""
        with self._lock:
            self._read_pos = self._write_pos
|
||||||
154
static/scripts/vibevoice-asr/realtime/models.py
Normal file
154
static/scripts/vibevoice-asr/realtime/models.py
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
"""
|
||||||
|
Data models for real-time ASR WebSocket communication.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Optional, List, Dict, Any
|
||||||
|
from dataclasses import dataclass, field, asdict
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
class MessageType(str, Enum):
    """WebSocket message types.

    Subclasses str so values serialize directly as JSON strings.
    """
    # Client -> Server
    AUDIO_CHUNK = "audio_chunk"
    CONFIG = "config"
    START = "start"
    STOP = "stop"

    # Server -> Client
    PARTIAL_RESULT = "partial_result"
    FINAL_RESULT = "final_result"
    VAD_EVENT = "vad_event"
    ERROR = "error"
    STATUS = "status"
|
||||||
|
|
||||||
|
|
||||||
|
class VADEventType(str, Enum):
    """VAD (voice activity detection) event types."""
    SPEECH_START = "speech_start"
    SPEECH_END = "speech_end"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SessionConfig:
    """Configuration for a real-time ASR session.

    All fields have defaults, so ``SessionConfig()`` is a fully usable
    configuration; ``from_dict`` tolerates unknown keys.
    """
    # Audio parameters
    sample_rate: int = 16000
    chunk_duration_sec: float = 3.0
    overlap_sec: float = 0.5

    # VAD parameters
    vad_threshold: float = 0.5
    min_speech_duration_ms: int = 250
    min_silence_duration_ms: int = 500
    min_volume_threshold: float = 0.01  # Minimum RMS volume (0.0-1.0) to consider as potential speech

    # ASR parameters
    max_new_tokens: int = 512
    temperature: float = 0.0
    context_info: Optional[str] = None

    # Behavior
    return_partial_results: bool = True

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this config to a plain dict."""
        serialized = asdict(self)
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SessionConfig":
        """Build a config from a dict, silently ignoring unknown keys."""
        accepted = {}
        for key, value in data.items():
            if key in cls.__dataclass_fields__:
                accepted[key] = value
        return cls(**accepted)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TranscriptionSegment:
    """A single transcription segment with metadata."""
    # Segment start, seconds since session start.
    start_time: float
    # Segment end, seconds since session start.
    end_time: float
    # Speaker label for this segment (e.g. "SPEAKER_00").
    speaker_id: str
    # Transcribed text.
    text: str

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict for JSON transport."""
        return asdict(self)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TranscriptionResult:
    """Transcription result message (partial, final, or error)."""
    type: MessageType
    text: str
    is_final: bool
    segments: List[TranscriptionSegment] = field(default_factory=list)
    latency_ms: float = 0.0
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-ready dict (enum flattened to its value)."""
        serialized_segments = [segment.to_dict() for segment in self.segments]
        return dict(
            type=self.type.value,
            text=self.text,
            is_final=self.is_final,
            segments=serialized_segments,
            latency_ms=self.latency_ms,
            timestamp=self.timestamp,
        )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VADEvent:
    """VAD event message (speech start/end notification)."""
    type: MessageType = MessageType.VAD_EVENT
    event: VADEventType = VADEventType.SPEECH_START
    timestamp: float = field(default_factory=time.time)
    audio_timestamp_sec: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-ready dict (enums flattened to their values)."""
        return dict(
            type=self.type.value,
            event=self.event.value,
            timestamp=self.timestamp,
            audio_timestamp_sec=self.audio_timestamp_sec,
        )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class StatusMessage:
    """Status message sent from server to client."""
    type: MessageType = MessageType.STATUS
    status: str = ""
    message: str = ""
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-ready dict."""
        return dict(
            type=self.type.value,
            status=self.status,
            message=self.message,
            timestamp=self.timestamp,
        )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ErrorMessage:
    """Error message sent from server to client."""
    type: MessageType = MessageType.ERROR
    error: str = ""
    code: str = ""
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-ready dict."""
        return dict(
            type=self.type.value,
            error=self.error,
            code=self.code,
            timestamp=self.timestamp,
        )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SpeechSegment:
    """Detected speech segment from VAD."""
    # First sample of the detected speech (absolute, session-relative).
    start_sample: int
    # One past the last sample of the detected speech.
    end_sample: int
    # start_sample in seconds.
    start_sec: float
    # end_sample in seconds.
    end_sec: float
    # Detection confidence in [0, 1]; defaults to fully confident.
    confidence: float = 1.0
|
||||||
300
static/scripts/vibevoice-asr/realtime/server.py
Normal file
300
static/scripts/vibevoice-asr/realtime/server.py
Normal file
@ -0,0 +1,300 @@
|
|||||||
|
"""
|
||||||
|
FastAPI WebSocket server for real-time ASR.
|
||||||
|
|
||||||
|
Provides WebSocket endpoint for streaming audio and receiving transcriptions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
|
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
from fastapi.responses import HTMLResponse, JSONResponse
|
||||||
|
import uvicorn
|
||||||
|
|
||||||
|
# Add parent directory to path
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from .models import (
|
||||||
|
SessionConfig,
|
||||||
|
TranscriptionResult,
|
||||||
|
VADEvent,
|
||||||
|
StatusMessage,
|
||||||
|
ErrorMessage,
|
||||||
|
MessageType,
|
||||||
|
)
|
||||||
|
from .asr_worker import ASRWorker
|
||||||
|
from .session_manager import SessionManager
|
||||||
|
|
||||||
|
|
||||||
|
# Global instances, populated by the FastAPI lifespan on startup and shared
# by every request handler in this module.
asr_worker: Optional[ASRWorker] = None
session_manager: Optional[SessionManager] = None
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager.

    Runs once at startup (before ``yield``): builds the shared ASR worker and
    session manager from environment variables and optionally pre-loads the
    model. Runs once at shutdown (after ``yield``): stops the session manager.
    """
    global asr_worker, session_manager

    # Startup
    print("Starting VibeVoice Realtime ASR Server...")

    # Get model path from environment or use default. ``main()`` forwards CLI
    # arguments through these same variables.
    model_path = os.environ.get("VIBEVOICE_MODEL_PATH", "microsoft/VibeVoice-ASR")
    device = os.environ.get("VIBEVOICE_DEVICE", "cuda")
    attn_impl = os.environ.get("VIBEVOICE_ATTN_IMPL", "flash_attention_2")

    # Initialize ASR worker (one shared instance for all sessions).
    asr_worker = ASRWorker(
        model_path=model_path,
        device=device,
        attn_implementation=attn_impl,
    )

    # Pre-load model (optional, can be lazy-loaded on first request).
    preload = os.environ.get("VIBEVOICE_PRELOAD_MODEL", "true").lower() == "true"
    if preload:
        print("Pre-loading ASR model...")
        asr_worker.load_model()

    # Initialize session manager with a concurrency cap.
    max_sessions = int(os.environ.get("VIBEVOICE_MAX_SESSIONS", "10"))
    session_manager = SessionManager(
        asr_worker=asr_worker,
        max_concurrent_sessions=max_sessions,
    )
    await session_manager.start()

    print("Server ready!")

    yield

    # Shutdown: cancel the cleanup task and close all active sessions.
    print("Shutting down...")
    await session_manager.stop()
|
||||||
|
|
||||||
|
|
||||||
|
# Create FastAPI app; ``lifespan`` wires up the ASR worker / session manager.
app = FastAPI(
    title="VibeVoice Realtime ASR",
    description="Real-time speech recognition using VibeVoice ASR",
    version="0.1.0",
    lifespan=lifespan,
)

# Mount static files (browser client) if the sibling ``static`` directory
# exists next to this package; skipped silently otherwise.
static_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "static")
if os.path.exists(static_dir):
    app.mount("/static", StaticFiles(directory=static_dir), name="static")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
|
||||||
|
async def root():
|
||||||
|
"""Root endpoint with API info."""
|
||||||
|
return {
|
||||||
|
"service": "VibeVoice Realtime ASR",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"endpoints": {
|
||||||
|
"websocket": "/ws/asr/{session_id}",
|
||||||
|
"health": "/health",
|
||||||
|
"stats": "/stats",
|
||||||
|
"client": "/static/realtime_client.html",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
|
||||||
|
async def health_check():
|
||||||
|
"""Health check endpoint."""
|
||||||
|
return {
|
||||||
|
"status": "healthy",
|
||||||
|
"model_loaded": asr_worker.is_loaded if asr_worker else False,
|
||||||
|
"active_sessions": len(session_manager._sessions) if session_manager else 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/stats")
|
||||||
|
async def get_stats():
|
||||||
|
"""Get server statistics."""
|
||||||
|
if session_manager is None:
|
||||||
|
raise HTTPException(status_code=503, detail="Server not initialized")
|
||||||
|
|
||||||
|
return session_manager.get_stats()
|
||||||
|
|
||||||
|
|
||||||
|
@app.websocket("/ws/asr/{session_id}")
async def websocket_asr(websocket: WebSocket, session_id: str):
    """
    WebSocket endpoint for real-time ASR.

    Protocol:
    1. Client connects and optionally sends config message
    2. Client sends binary audio chunks (PCM 16-bit, 16kHz, mono)
    3. Server sends JSON messages with transcription results

    Message types (server -> client):
    - partial_result: Intermediate transcription
    - final_result: Complete transcription for a segment
    - vad_event: Speech start/end events
    - error: Error messages
    - status: Status updates
    """
    await websocket.accept()

    # Send connection confirmation before the session is actually created.
    await websocket.send_json(
        StatusMessage(
            status="connected",
            message=f"Session {session_id} connected",
        ).to_dict()
    )

    # Result callback: forwards transcription results to this client.
    # Send failures are logged, not raised, so one bad send doesn't kill
    # the processing pipeline.
    async def on_result(result: TranscriptionResult):
        try:
            await websocket.send_json(result.to_dict())
        except Exception as e:
            print(f"[{session_id}] Failed to send result: {e}")

    # VAD event callback: forwards speech start/end events to this client.
    async def on_vad_event(event: VADEvent):
        try:
            await websocket.send_json(event.to_dict())
        except Exception as e:
            print(f"[{session_id}] Failed to send VAD event: {e}")

    # Create session; returns None when the manager's session cap is reached.
    session = await session_manager.create_session(
        session_id=session_id,
        on_result=on_result,
        on_vad_event=on_vad_event,
    )

    if session is None:
        await websocket.send_json(
            ErrorMessage(
                error="Maximum sessions reached",
                code="MAX_SESSIONS",
            ).to_dict()
        )
        await websocket.close()
        return

    await websocket.send_json(
        StatusMessage(
            status="ready",
            message="Session ready for audio",
        ).to_dict()
    )

    try:
        while True:
            # Receive raw ASGI message so binary and text frames can be
            # distinguished in one loop.
            message = await websocket.receive()

            if message["type"] == "websocket.disconnect":
                break

            # Handle binary audio data (PCM 16-bit, 16kHz, mono).
            if "bytes" in message:
                audio_data = message["bytes"]
                try:
                    await session.process_audio_chunk(audio_data)
                except Exception as e:
                    # Keep the connection alive on per-chunk failures.
                    print(f"[{session_id}] Error processing audio: {e}")
                    import traceback
                    traceback.print_exc()

            # Handle JSON control messages: config / stop / ping.
            elif "text" in message:
                try:
                    data = json.loads(message["text"])
                    msg_type = data.get("type")

                    if msg_type == "config":
                        # Update session config (partial updates supported by
                        # RealtimeSession.update_config).
                        config = SessionConfig.from_dict(data.get("config", {}))
                        session.update_config(config)
                        await websocket.send_json(
                            StatusMessage(
                                status="config_updated",
                                message="Configuration updated",
                            ).to_dict()
                        )

                    elif msg_type == "stop":
                        # Flush any buffered speech, acknowledge, then leave
                        # the receive loop (connection cleanup in finally).
                        await session.flush()
                        await websocket.send_json(
                            StatusMessage(
                                status="stopped",
                                message="Session stopped",
                            ).to_dict()
                        )
                        break

                    elif msg_type == "ping":
                        await websocket.send_json({"type": "pong", "timestamp": time.time()})

                except json.JSONDecodeError:
                    await websocket.send_json(
                        ErrorMessage(
                            error="Invalid JSON",
                            code="INVALID_JSON",
                        ).to_dict()
                    )

    except WebSocketDisconnect:
        print(f"[{session_id}] Client disconnected")
    except Exception as e:
        print(f"[{session_id}] Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Clean up session regardless of how the loop exited; this also
        # flushes remaining audio (see SessionManager.close_session).
        await session_manager.close_session(session_id)
        print(f"[{session_id}] Session closed")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse CLI arguments, export them to the environment, and run the server."""
    import argparse

    parser = argparse.ArgumentParser(description="VibeVoice Realtime ASR Server")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
    parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
    parser.add_argument(
        "--model-path",
        type=str,
        default="microsoft/VibeVoice-ASR",
        help="Path to VibeVoice ASR model",
    )
    parser.add_argument("--device", type=str, default="cuda", help="Device (cuda/cpu)")
    parser.add_argument("--max-sessions", type=int, default=10, help="Max concurrent sessions")
    parser.add_argument("--no-preload", action="store_true", help="Don't preload model")
    args = parser.parse_args()

    # The FastAPI lifespan reads its settings from the environment, so the
    # CLI values are forwarded through environment variables.
    os.environ.update(
        {
            "VIBEVOICE_MODEL_PATH": args.model_path,
            "VIBEVOICE_DEVICE": args.device,
            "VIBEVOICE_MAX_SESSIONS": str(args.max_sessions),
            "VIBEVOICE_PRELOAD_MODEL": "false" if args.no_preload else "true",
        }
    )

    print(f"Starting server on {args.host}:{args.port}")
    print(f"Model: {args.model_path}")
    print(f"Device: {args.device}")
    print(f"Max sessions: {args.max_sessions}")

    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
401
static/scripts/vibevoice-asr/realtime/session_manager.py
Normal file
401
static/scripts/vibevoice-asr/realtime/session_manager.py
Normal file
@ -0,0 +1,401 @@
|
|||||||
|
"""
|
||||||
|
Session manager for real-time ASR.
|
||||||
|
|
||||||
|
Manages multiple concurrent client sessions with resource isolation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from typing import Dict, Optional, Callable, Any
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
import threading
|
||||||
|
|
||||||
|
from .models import (
|
||||||
|
SessionConfig,
|
||||||
|
TranscriptionResult,
|
||||||
|
VADEvent,
|
||||||
|
StatusMessage,
|
||||||
|
ErrorMessage,
|
||||||
|
MessageType,
|
||||||
|
SpeechSegment,
|
||||||
|
)
|
||||||
|
from .audio_buffer import AudioBuffer
|
||||||
|
from .vad_processor import VADProcessor
|
||||||
|
from .asr_worker import ASRWorker
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SessionStats:
    """Statistics for a session."""
    # Wall-clock timestamps (epoch seconds).
    created_at: float = field(default_factory=time.time)
    last_activity: float = field(default_factory=time.time)
    # Running counters updated by RealtimeSession as audio is processed.
    audio_received_sec: float = 0.0
    chunks_received: int = 0
    segments_transcribed: int = 0
    # Sum of per-segment latencies; averaged in RealtimeSession.get_stats().
    total_latency_ms: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class RealtimeSession:
    """
    A single real-time ASR session.

    Manages audio buffering, VAD, and ASR for one client connection.
    """

    def __init__(
        self,
        session_id: str,
        asr_worker: ASRWorker,
        config: Optional[SessionConfig] = None,
        on_result: Optional[Callable[[TranscriptionResult], Any]] = None,
        on_vad_event: Optional[Callable[[VADEvent], Any]] = None,
    ):
        """
        Initialize a session.

        Args:
            session_id: Unique session identifier
            asr_worker: Shared ASR worker instance
            config: Session configuration
            on_result: Callback for transcription results
            on_vad_event: Callback for VAD events
        """
        self.session_id = session_id
        self.asr_worker = asr_worker
        self.config = config or SessionConfig()
        self.on_result = on_result
        self.on_vad_event = on_vad_event

        # Components: rolling audio store and speech detector, both built
        # from this session's config.
        self.audio_buffer = AudioBuffer(
            sample_rate=self.config.sample_rate,
            chunk_duration_sec=self.config.chunk_duration_sec,
            overlap_sec=self.config.overlap_sec,
        )

        self.vad_processor = VADProcessor(
            sample_rate=self.config.sample_rate,
            threshold=self.config.vad_threshold,
            min_speech_duration_ms=self.config.min_speech_duration_ms,
            min_silence_duration_ms=self.config.min_silence_duration_ms,
            min_volume_threshold=self.config.min_volume_threshold,
        )

        # State
        self.is_active = True
        self.stats = SessionStats()
        self._processing_lock = asyncio.Lock()
        # NOTE(review): _pending_tasks is never read or written elsewhere in
        # this class — looks like dead state; confirm before removing.
        self._pending_tasks: list = []

    async def process_audio_chunk(self, audio_data: bytes) -> None:
        """
        Process an incoming audio chunk.

        Args:
            audio_data: Raw PCM audio data (16-bit, 16kHz, mono)
        """
        if not self.is_active:
            return

        self.stats.last_activity = time.time()
        self.stats.chunks_received += 1

        # Convert bytes to float32 array in [-1, 1).
        import numpy as np
        audio_int16 = np.frombuffer(audio_data, dtype=np.int16)
        audio_float = audio_int16.astype(np.float32) / 32768.0

        self.stats.audio_received_sec += len(audio_float) / self.config.sample_rate

        # Add to buffer so completed segments can be retrieved later by time range.
        self.audio_buffer.append(audio_float)

        # Process with VAD: yields completed speech segments plus start/end events.
        segments, events = self.vad_processor.process(audio_float)

        # Send VAD events to the client callback, if registered.
        if self.on_vad_event:
            for event in events:
                await self._send_callback(self.on_vad_event, event)

        # Transcribe each completed speech segment; per-segment errors are
        # logged so one bad segment doesn't abort the chunk.
        for segment in segments:
            try:
                await self._transcribe_segment(segment)
            except Exception as e:
                print(f"[Session {self.session_id}] Transcription error: {e}")
                import traceback
                traceback.print_exc()

    async def _transcribe_segment(self, segment: SpeechSegment) -> None:
        """Transcribe a detected speech segment."""
        # Get audio for segment from buffer (by time range).
        audio = self.audio_buffer.get_segment(segment.start_sec, segment.end_sec)

        if audio is None or len(audio) == 0:
            # Segment may have scrolled out of the rolling buffer.
            print(f"[Session {self.session_id}] Could not retrieve audio for segment")
            return

        self.stats.segments_transcribed += 1

        # Forward intermediate (partial) results through the same client callback.
        async def on_partial(result: TranscriptionResult):
            if self.on_result:
                await self._send_callback(self.on_result, result)

        # Run transcription on the shared worker.
        result = await self.asr_worker.transcribe_segment(
            audio=audio,
            sample_rate=self.config.sample_rate,
            context_info=self.config.context_info,
            segment_start_sec=segment.start_sec,
            segment_end_sec=segment.end_sec,
            config=self.config,
            on_partial=on_partial if self.config.return_partial_results else None,
        )

        self.stats.total_latency_ms += result.latency_ms

        # Send final result.
        if self.on_result:
            await self._send_callback(self.on_result, result)

    async def _send_callback(self, callback: Callable, data: Any) -> None:
        """Send data via callback, handling both sync and async callables."""
        try:
            if asyncio.iscoroutinefunction(callback):
                await callback(data)
            else:
                callback(data)
        except Exception as e:
            # Callback failures (e.g. a closed websocket) must not break processing.
            print(f"[Session {self.session_id}] Callback error: {e}")

    async def flush(self) -> None:
        """
        Flush any remaining audio and force transcription.

        Called when session ends to process any remaining speech.
        """
        # Force end any active speech tracked by the VAD.
        segment = self.vad_processor.force_end_speech()
        if segment:
            await self._transcribe_segment(segment)

        # Also check for any unprocessed audio in buffer.
        chunk_info = self.audio_buffer.get_all_unprocessed()
        if chunk_info and len(chunk_info.audio) > self.config.sample_rate * 0.5:
            # More than 0.5 seconds of unprocessed audio: synthesize a
            # segment covering it and transcribe.
            forced_segment = SpeechSegment(
                start_sample=chunk_info.start_sample,
                end_sample=chunk_info.end_sample,
                start_sec=chunk_info.start_sec,
                end_sec=chunk_info.end_sec,
            )
            await self._transcribe_segment(forced_segment)

    def update_config(self, new_config: SessionConfig) -> None:
        """Update session configuration (partial update supported)."""
        # Merge with existing config - only update non-default values.
        # NOTE(review): this compares against hard-coded defaults, so a client
        # explicitly sending the default value (e.g. vad_threshold=0.5) is
        # silently ignored, and these literals must stay in sync with
        # SessionConfig's defaults — fragile; verify against SessionConfig.
        if new_config.vad_threshold != 0.5:
            self.config.vad_threshold = new_config.vad_threshold
        if new_config.min_speech_duration_ms != 250:
            self.config.min_speech_duration_ms = new_config.min_speech_duration_ms
        if new_config.min_silence_duration_ms != 500:
            self.config.min_silence_duration_ms = new_config.min_silence_duration_ms
        if new_config.min_volume_threshold != 0.01:
            self.config.min_volume_threshold = new_config.min_volume_threshold
        if new_config.context_info is not None:
            self.config.context_info = new_config.context_info

        # Recreate VAD processor with new parameters (this resets VAD state,
        # including any in-progress speech detection).
        self.vad_processor = VADProcessor(
            sample_rate=self.config.sample_rate,
            threshold=self.config.vad_threshold,
            min_speech_duration_ms=self.config.min_speech_duration_ms,
            min_silence_duration_ms=self.config.min_silence_duration_ms,
            min_volume_threshold=self.config.min_volume_threshold,
        )
        print(f"[Session {self.session_id}] Config updated: vad_threshold={self.config.vad_threshold}, "
              f"min_speech={self.config.min_speech_duration_ms}ms, min_silence={self.config.min_silence_duration_ms}ms, "
              f"min_volume={self.config.min_volume_threshold}")

    def close(self) -> None:
        """Close the session and release resources."""
        self.is_active = False
        self.audio_buffer.clear()
        self.vad_processor.reset()

    def get_stats(self) -> Dict:
        """Get session statistics as a JSON-compatible dict."""
        return {
            "session_id": self.session_id,
            "created_at": self.stats.created_at,
            "last_activity": self.stats.last_activity,
            "duration_sec": time.time() - self.stats.created_at,
            "audio_received_sec": self.stats.audio_received_sec,
            "chunks_received": self.stats.chunks_received,
            "segments_transcribed": self.stats.segments_transcribed,
            # Average over transcribed segments; 0 before the first segment.
            "avg_latency_ms": (
                self.stats.total_latency_ms / self.stats.segments_transcribed
                if self.stats.segments_transcribed > 0 else 0
            ),
            "is_active": self.is_active,
            "vad_speech_active": self.vad_processor.is_speech_active,
        }
|
||||||
|
|
||||||
|
|
||||||
|
class SessionManager:
    """
    Manages multiple concurrent ASR sessions.

    Features:
    - Session creation and cleanup
    - Resource limiting (max concurrent sessions)
    - Idle session timeout
    - Shared ASR worker management
    """

    def __init__(
        self,
        asr_worker: ASRWorker,
        max_concurrent_sessions: int = 10,
        session_timeout_sec: float = 300.0,
    ):
        """
        Initialize the session manager.

        Args:
            asr_worker: Shared ASR worker
            max_concurrent_sessions: Maximum number of concurrent sessions
            session_timeout_sec: Timeout for idle sessions
        """
        self.asr_worker = asr_worker
        self.max_sessions = max_concurrent_sessions
        self.session_timeout = session_timeout_sec

        # Session registry, guarded by an asyncio lock.
        self._sessions: Dict[str, RealtimeSession] = {}
        self._lock = asyncio.Lock()

        # Background idle-session cleanup task (started in start()).
        self._cleanup_task: Optional[asyncio.Task] = None

    async def start(self) -> None:
        """Start the session manager's background cleanup task."""
        self._cleanup_task = asyncio.create_task(self._cleanup_loop())

    async def stop(self) -> None:
        """Stop the session manager and close all sessions."""
        if self._cleanup_task:
            self._cleanup_task.cancel()
            try:
                await self._cleanup_task
            except asyncio.CancelledError:
                # Expected when cancelling our own task.
                pass

        # Close sessions directly (no flush) since the server is going down.
        async with self._lock:
            for session in self._sessions.values():
                session.close()
            self._sessions.clear()

    async def create_session(
        self,
        session_id: Optional[str] = None,
        config: Optional[SessionConfig] = None,
        on_result: Optional[Callable[[TranscriptionResult], Any]] = None,
        on_vad_event: Optional[Callable[[VADEvent], Any]] = None,
    ) -> Optional[RealtimeSession]:
        """
        Create a new session.

        Args:
            session_id: Optional session ID (generated if not provided)
            config: Session configuration
            on_result: Callback for results
            on_vad_event: Callback for VAD events

        Returns:
            Created session, or None if limit reached
        """
        async with self._lock:
            # Check session limit.
            if len(self._sessions) >= self.max_sessions:
                return None

            # Generate a short session ID if not provided.
            if session_id is None:
                session_id = str(uuid.uuid4())[:8]

            # Duplicate IDs return the existing session rather than erroring.
            if session_id in self._sessions:
                return self._sessions[session_id]

            # Create session sharing this manager's ASR worker.
            session = RealtimeSession(
                session_id=session_id,
                asr_worker=self.asr_worker,
                config=config,
                on_result=on_result,
                on_vad_event=on_vad_event,
            )

            self._sessions[session_id] = session
            return session

    async def get_session(self, session_id: str) -> Optional[RealtimeSession]:
        """Get a session by ID, or None if it does not exist."""
        async with self._lock:
            return self._sessions.get(session_id)

    async def close_session(self, session_id: str) -> bool:
        """
        Close and remove a session.

        Flushes remaining audio (which may trigger final transcriptions)
        before releasing the session's resources.

        Args:
            session_id: Session to close

        Returns:
            True if session was found and closed
        """
        async with self._lock:
            session = self._sessions.pop(session_id, None)
            if session:
                # NOTE: flush() runs while holding the manager lock, so other
                # session operations wait for the final transcription.
                await session.flush()
                session.close()
                return True
            return False

    async def _cleanup_loop(self) -> None:
        """Background task to clean up idle sessions."""
        while True:
            try:
                await asyncio.sleep(60)  # Check every minute

                current_time = time.time()
                sessions_to_close = []

                # Collect idle IDs under the lock, then close outside it
                # (close_session re-acquires the lock).
                async with self._lock:
                    for session_id, session in self._sessions.items():
                        idle_time = current_time - session.stats.last_activity
                        if idle_time > self.session_timeout:
                            sessions_to_close.append(session_id)

                for session_id in sessions_to_close:
                    print(f"Closing idle session: {session_id}")
                    await self.close_session(session_id)

            except asyncio.CancelledError:
                # Normal shutdown path (cancelled by stop()).
                break
            except Exception as e:
                # Keep the loop alive on unexpected errors.
                print(f"Cleanup error: {e}")

    def get_stats(self) -> Dict:
        """Get manager statistics, including per-session stats."""
        return {
            "active_sessions": len(self._sessions),
            "max_sessions": self.max_sessions,
            "session_timeout_sec": self.session_timeout,
            "sessions": {
                sid: session.get_stats()
                for sid, session in self._sessions.items()
            },
        }
|
||||||
295
static/scripts/vibevoice-asr/realtime/vad_processor.py
Normal file
295
static/scripts/vibevoice-asr/realtime/vad_processor.py
Normal file
@ -0,0 +1,295 @@
|
|||||||
|
"""
|
||||||
|
Voice Activity Detection (VAD) processor using Silero-VAD (ONNX version).
|
||||||
|
|
||||||
|
Detects speech segments in real-time audio streams.
|
||||||
|
Uses ONNX runtime to avoid torchaudio dependency issues.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import threading
|
||||||
|
import os
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
from .models import SpeechSegment, VADEvent, VADEventType, MessageType
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VADState:
    """Internal state of the VAD processor."""
    # Whether a confirmed speech segment is currently open.
    is_speech_active: bool = False
    # Absolute sample offsets (since reset) of the last speech / silence onsets.
    speech_start_sample: int = 0
    silence_start_sample: int = 0
    # Most recent per-window speech probability from the model.
    last_speech_prob: float = 0.0
    # Total samples seen since reset; used to place windows on the timeline.
    total_samples_processed: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
class VADProcessor:
|
||||||
|
"""
|
||||||
|
Voice Activity Detection using Silero-VAD (ONNX version).
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Real-time speech detection
|
||||||
|
- Configurable thresholds for speech/silence duration
|
||||||
|
- Event generation for speech start/end
|
||||||
|
- Thread-safe operations
|
||||||
|
- No torchaudio dependency (uses ONNX runtime)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Silero VAD ONNX model URL
|
||||||
|
ONNX_MODEL_URL = "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"
|
||||||
|
|
||||||
|
    def __init__(
        self,
        sample_rate: int = 16000,
        threshold: float = 0.5,
        min_speech_duration_ms: int = 250,
        min_silence_duration_ms: int = 500,
        window_size_samples: int = 512,
        min_volume_threshold: float = 0.01,
    ):
        """
        Initialize the VAD processor.

        Args:
            sample_rate: Audio sample rate (must be 16000 for Silero-VAD)
            threshold: Speech probability threshold (0.0-1.0)
            min_speech_duration_ms: Minimum speech duration to trigger speech_start
            min_silence_duration_ms: Minimum silence duration to trigger speech_end
            window_size_samples: VAD window size (512 for 16kHz = 32ms)
            min_volume_threshold: Minimum RMS volume (0.0-1.0) to consider as potential speech

        Raises:
            ValueError: if sample_rate is not 16000.
        """
        if sample_rate != 16000:
            raise ValueError("Silero-VAD requires 16kHz sample rate")

        self.sample_rate = sample_rate
        self.threshold = threshold
        # Durations converted from milliseconds to sample counts up front.
        self.min_speech_samples = int(min_speech_duration_ms * sample_rate / 1000)
        self.min_silence_samples = int(min_silence_duration_ms * sample_rate / 1000)
        self.window_size = window_size_samples
        self.min_volume_threshold = min_volume_threshold

        # Load ONNX model (downloads to ~/.cache/silero-vad on first use).
        self._session = None
        self._load_model()

        # ONNX model state - single state tensor (size depends on model version).
        # Silero VAD v5 uses a single 'state' tensor of shape (2, 1, 128).
        self._state_tensor = np.zeros((2, 1, 128), dtype=np.float32)

        # Detection state machine, guarded by a lock for thread safety.
        self._state = VADState()
        self._lock = threading.Lock()

        # Pending speech segment (being accumulated before it is confirmed).
        self._pending_segment_start: Optional[int] = None
|
||||||
|
|
||||||
|
def _get_model_path(self) -> str:
|
||||||
|
"""Get path to ONNX model, downloading if necessary."""
|
||||||
|
cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "silero-vad")
|
||||||
|
os.makedirs(cache_dir, exist_ok=True)
|
||||||
|
model_path = os.path.join(cache_dir, "silero_vad.onnx")
|
||||||
|
|
||||||
|
if not os.path.exists(model_path):
|
||||||
|
print(f"Downloading Silero-VAD ONNX model to {model_path}...")
|
||||||
|
urllib.request.urlretrieve(self.ONNX_MODEL_URL, model_path)
|
||||||
|
print("Download complete.")
|
||||||
|
|
||||||
|
return model_path
|
||||||
|
|
||||||
|
def _load_model(self) -> None:
|
||||||
|
"""Load Silero-VAD ONNX model."""
|
||||||
|
try:
|
||||||
|
import onnxruntime as ort
|
||||||
|
|
||||||
|
model_path = self._get_model_path()
|
||||||
|
self._session = ort.InferenceSession(
|
||||||
|
model_path,
|
||||||
|
providers=['CPUExecutionProvider']
|
||||||
|
)
|
||||||
|
print(f"Silero-VAD ONNX model loaded from {model_path}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(f"Failed to load Silero-VAD ONNX model: {e}")
|
||||||
|
|
||||||
|
    def _run_inference(self, audio_window: np.ndarray) -> float:
        """Run VAD inference on a single window.

        Returns the model's speech probability for the window and updates the
        recurrent state tensor carried between calls.
        """
        # Prepare input: (1, window_size) float32 plus the sample rate.
        audio_input = audio_window.reshape(1, -1).astype(np.float32)
        sr_input = np.array([self.sample_rate], dtype=np.int64)

        # Run inference - Silero VAD v5 uses 'state' instead of 'h'/'c';
        # 'stateN' is the updated state output.
        outputs = self._session.run(
            ['output', 'stateN'],
            {
                'input': audio_input,
                'sr': sr_input,
                'state': self._state_tensor,
            }
        )

        # Carry the recurrent state forward to the next window.
        speech_prob = outputs[0][0][0]
        self._state_tensor = outputs[1]

        return float(speech_prob)
|
||||||
|
|
||||||
|
    def reset(self) -> None:
        """Reset VAD state for a new session."""
        with self._lock:
            # Fresh detection state machine and no half-open segment.
            self._state = VADState()
            self._pending_segment_start = None
            # Reset the model's recurrent state tensor (Silero VAD v5 shape).
            self._state_tensor = np.zeros((2, 1, 128), dtype=np.float32)
|
||||||
|
|
||||||
|
def process(
    self,
    audio_chunk: np.ndarray,
    return_events: bool = True,
) -> Tuple[List[SpeechSegment], List[VADEvent]]:
    """
    Run VAD over one chunk of audio and collect finished speech segments.

    Args:
        audio_chunk: Audio samples (converted to float32 if needed).
        return_events: If True, SPEECH_START/SPEECH_END events are emitted
            alongside the completed segments.

    Returns:
        Tuple of (completed_segments, events).
    """
    if audio_chunk.dtype != np.float32:
        audio_chunk = audio_chunk.astype(np.float32)

    finished: List[SpeechSegment] = []
    emitted: List[VADEvent] = []

    with self._lock:
        base = self._state.total_samples_processed
        win = self.window_size

        # Walk the chunk one full window at a time; a trailing partial
        # window is simply ignored.
        for offset in range(0, (len(audio_chunk) // win) * win, win):
            frame = audio_chunk[offset:offset + win]

            # Volume gate: frames below the RMS floor are forced to
            # "silence" without invoking the model at all.
            if np.sqrt(np.mean(frame ** 2)) < self.min_volume_threshold:
                prob = 0.0
            else:
                prob = self._run_inference(frame)

            self._state.last_speech_prob = prob
            frame_end = base + offset + win  # absolute end sample of this frame

            if prob >= self.threshold:
                if self._state.is_speech_active:
                    # Speech continues: cancel any pending silence run.
                    self._state.silence_start_sample = 0
                else:
                    # Candidate speech start — remember where the burst began.
                    if self._pending_segment_start is None:
                        self._pending_segment_start = frame_end - win
                    # Promote to confirmed speech once it lasts long enough.
                    if frame_end - self._pending_segment_start >= self.min_speech_samples:
                        self._state.is_speech_active = True
                        self._state.speech_start_sample = self._pending_segment_start
                        if return_events:
                            emitted.append(VADEvent(
                                type=MessageType.VAD_EVENT,
                                event=VADEventType.SPEECH_START,
                                audio_timestamp_sec=self._pending_segment_start / self.sample_rate,
                            ))
            else:
                if not self._state.is_speech_active:
                    # Silence while idle: drop an unconfirmed candidate.
                    self._pending_segment_start = None
                else:
                    # Record where silence began. NOTE(review): 0 doubles as
                    # the "not set" sentinel, which would collide with a
                    # silence starting at sample 0 — preserved as-is.
                    if self._state.silence_start_sample == 0:
                        self._state.silence_start_sample = frame_end
                    # Close the segment once silence persists long enough.
                    if frame_end - self._state.silence_start_sample >= self.min_silence_samples:
                        finished.append(SpeechSegment(
                            start_sample=self._state.speech_start_sample,
                            end_sample=self._state.silence_start_sample,
                            start_sec=self._state.speech_start_sample / self.sample_rate,
                            end_sec=self._state.silence_start_sample / self.sample_rate,
                        ))
                        if return_events:
                            emitted.append(VADEvent(
                                type=MessageType.VAD_EVENT,
                                event=VADEventType.SPEECH_END,
                                audio_timestamp_sec=self._state.silence_start_sample / self.sample_rate,
                            ))
                        # Back to the idle state.
                        self._state.is_speech_active = False
                        self._state.speech_start_sample = 0
                        self._state.silence_start_sample = 0
                        self._pending_segment_start = None

        self._state.total_samples_processed += len(audio_chunk)

    return finished, emitted
def force_end_speech(self) -> Optional[SpeechSegment]:
    """
    Close the current speech segment immediately.

    Intended for session teardown, where we cannot wait for trailing
    silence to end the segment naturally.

    Returns:
        The closed segment, or None when no speech was active.
    """
    with self._lock:
        if not self._state.is_speech_active:
            return None

        start = self._state.speech_start_sample
        end = self._state.total_samples_processed
        segment = SpeechSegment(
            start_sample=start,
            end_sample=end,
            start_sec=start / self.sample_rate,
            end_sec=end / self.sample_rate,
        )

        # Return to the idle state.
        self._state.is_speech_active = False
        self._state.speech_start_sample = 0
        self._state.silence_start_sample = 0
        self._pending_segment_start = None

        return segment
@property
def is_speech_active(self) -> bool:
    """Whether a confirmed speech segment is currently in progress."""
    with self._lock:
        active = self._state.is_speech_active
    return active
@property
def last_speech_probability(self) -> float:
    """Speech probability computed for the most recent window."""
    with self._lock:
        prob = self._state.last_speech_prob
    return prob
@property
def current_speech_duration_sec(self) -> float:
    """Length in seconds of the active speech segment (0.0 when idle)."""
    with self._lock:
        if not self._state.is_speech_active:
            return 0.0
        elapsed = self._state.total_samples_processed - self._state.speech_start_sample
        return elapsed / self.sample_rate
7
static/scripts/vibevoice-asr/requirements-realtime.txt
Normal file
7
static/scripts/vibevoice-asr/requirements-realtime.txt
Normal file
@ -0,0 +1,7 @@
# Real-time ASR dependencies
fastapi>=0.100.0
uvicorn[standard]>=0.23.0
websockets>=11.0
numpy>=1.24.0
soundfile>=0.12.0
onnxruntime
73
static/scripts/vibevoice-asr/run_all.sh
Executable file
73
static/scripts/vibevoice-asr/run_all.sh
Executable file
@ -0,0 +1,73 @@
|
|||||||
|
#!/bin/bash
# Run both Gradio demo and Realtime ASR server
#
# Usage:
#   ./run_all.sh
#
# Ports:
#   - 7860: Gradio UI (batch ASR)
#   - 8000: WebSocket API (realtime ASR)

set -e

cd "$(dirname "$0")"

# Configuration — every value can be overridden via the environment.
GRADIO_HOST="${GRADIO_HOST:-0.0.0.0}"
GRADIO_PORT="${GRADIO_PORT:-7860}"
REALTIME_HOST="${REALTIME_HOST:-0.0.0.0}"
REALTIME_PORT="${REALTIME_PORT:-8000}"
MODEL_PATH="${VIBEVOICE_MODEL_PATH:-microsoft/VibeVoice-ASR}"

echo "=========================================="
echo "VibeVoice ASR - All Services"
echo "=========================================="
echo ""
echo "Starting services:"
echo "  - Gradio UI:    http://$GRADIO_HOST:$GRADIO_PORT"
echo "  - Realtime ASR: http://$REALTIME_HOST:$REALTIME_PORT"
echo "  - Test Client:  http://$REALTIME_HOST:$REALTIME_PORT/static/realtime_client.html"
echo ""
echo "Model: $MODEL_PATH"
echo "=========================================="
echo ""

# Clean up both background services when the script exits for any reason.
cleanup() {
    echo ""
    echo "Shutting down..."
    kill $REALTIME_PID 2>/dev/null || true
    kill $GRADIO_PID 2>/dev/null || true
    wait
    echo "All services stopped."
}
trap cleanup EXIT INT TERM

# Start Realtime ASR server in background
echo "[1/2] Starting Realtime ASR server..."
python -m realtime.server \
    --host "$REALTIME_HOST" \
    --port "$REALTIME_PORT" \
    --model-path "$MODEL_PATH" \
    --no-preload &
REALTIME_PID=$!

# Give the server a moment to bind its port before starting the UI.
sleep 2

# Start Gradio demo in background
echo "[2/2] Starting Gradio demo..."
python demo/vibevoice_asr_gradio_demo.py \
    --host "$GRADIO_HOST" \
    --port "$GRADIO_PORT" \
    --model_path "$MODEL_PATH" &
GRADIO_PID=$!

echo ""
echo "Both services started. Press Ctrl+C to stop."
echo ""

# Block until either service exits. `wait -n` without arguments waits for
# ANY background job; passing PIDs to -n requires bash >= 5.1 and errors
# on older shells, so the bare form is used for portability. The EXIT
# trap then tears down the surviving process.
wait -n

# If one exits, the trap will clean up the other
41
static/scripts/vibevoice-asr/run_realtime.sh
Executable file
41
static/scripts/vibevoice-asr/run_realtime.sh
Executable file
@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/bash
# Run VibeVoice Realtime ASR Server
#
# Usage:
#   ./run_realtime.sh [options]
#
# Options are passed to the server (see --help for details)

set -e

cd "$(dirname "$0")"

# Defaults, each overridable through a VIBEVOICE_* environment variable.
HOST="${VIBEVOICE_HOST:-0.0.0.0}"
PORT="${VIBEVOICE_PORT:-8000}"
MODEL_PATH="${VIBEVOICE_MODEL_PATH:-microsoft/VibeVoice-ASR}"
DEVICE="${VIBEVOICE_DEVICE:-cuda}"
MAX_SESSIONS="${VIBEVOICE_MAX_SESSIONS:-10}"

# Banner with the effective configuration.
echo "=========================================="
echo "VibeVoice Realtime ASR Server"
echo "=========================================="
echo "Host:         $HOST"
echo "Port:         $PORT"
echo "Model:        $MODEL_PATH"
echo "Device:       $DEVICE"
echo "Max Sessions: $MAX_SESSIONS"
echo "=========================================="
echo ""
echo "Web client: http://$HOST:$PORT/static/realtime_client.html"
echo "WebSocket:  ws://$HOST:$PORT/ws/asr/{session_id}"
echo ""

# Launch the server; any extra CLI options are passed straight through.
server_args=(
    --host "$HOST"
    --port "$PORT"
    --model-path "$MODEL_PATH"
    --device "$DEVICE"
    --max-sessions "$MAX_SESSIONS"
)
python -m realtime.server "${server_args[@]}" "$@"
287
static/scripts/vibevoice-asr/setup.sh
Normal file
287
static/scripts/vibevoice-asr/setup.sh
Normal file
@ -0,0 +1,287 @@
|
|||||||
|
#!/bin/bash
# VibeVoice-ASR Setup Script for DGX Spark
# Downloads and builds the VibeVoice-ASR container
#
# Usage:
#   curl -sL https://docs.techswan.online/scripts/vibevoice-asr/setup.sh | bash
#   curl -sL https://docs.techswan.online/scripts/vibevoice-asr/setup.sh | bash -s build
#   curl -sL https://docs.techswan.online/scripts/vibevoice-asr/setup.sh | bash -s serve

set -e

# Where the distribution is hosted and where it gets installed locally.
BASE_URL="https://docs.techswan.online/scripts/vibevoice-asr"
INSTALL_DIR="${VIBEVOICE_DIR:-$HOME/vibevoice-asr}"
IMAGE_NAME="vibevoice-asr:dgx-spark"
CONTAINER_NAME="vibevoice-asr"

# ANSI colors consumed by the log_* helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
|
# Colored log helpers: each prints a tagged, color-coded line to stdout.
log_info()  { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn()  { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
log_step()  { echo -e "${BLUE}[STEP]${NC} $1"; }
download_file() {
    # Fetch a single file from $1 into path $2, creating parent dirs.
    # Prefers curl, falls back to wget; exits if neither is installed.
    local url="$1"
    local dest="$2"
    # Declare and assign separately so the command's exit status is not
    # masked by `local` (shellcheck SC2155).
    local dir
    dir=$(dirname "$dest")

    mkdir -p "$dir"

    if command -v curl &> /dev/null; then
        # -f is essential: without it curl saves HTTP error pages (404s)
        # as the downloaded file and exits 0, silently corrupting the
        # install. With -f the non-zero exit aborts under `set -e`.
        curl -fsSL "$url" -o "$dest"
    elif command -v wget &> /dev/null; then
        wget -q "$url" -O "$dest"
    else
        log_error "curl or wget is required"
        exit 1
    fi
}
download_files() {
    # Fetch the full VibeVoice-ASR distribution into $INSTALL_DIR,
    # preserving the repository layout (top level, realtime/, static/).
    log_step "Downloading VibeVoice-ASR files to $INSTALL_DIR..."

    mkdir -p "$INSTALL_DIR"
    cd "$INSTALL_DIR"

    # Top-level files.
    local file
    for file in \
        Dockerfile \
        requirements-realtime.txt \
        test_vibevoice.py \
        vibevoice_asr_gradio_demo_patched.py \
        run_realtime.sh \
        run_all.sh
    do
        log_info "Downloading $file..."
        download_file "$BASE_URL/$file" "$INSTALL_DIR/$file"
    done

    # Realtime server module.
    mkdir -p "$INSTALL_DIR/realtime"
    for file in \
        __init__.py \
        models.py \
        server.py \
        asr_worker.py \
        session_manager.py \
        audio_buffer.py \
        vad_processor.py
    do
        log_info "Downloading realtime/$file..."
        download_file "$BASE_URL/realtime/$file" "$INSTALL_DIR/realtime/$file"
    done

    # Browser test client.
    mkdir -p "$INSTALL_DIR/static"
    log_info "Downloading static/realtime_client.html..."
    download_file "$BASE_URL/static/realtime_client.html" "$INSTALL_DIR/static/realtime_client.html"

    # Entry-point scripts must be executable.
    chmod +x "$INSTALL_DIR/run_realtime.sh" "$INSTALL_DIR/run_all.sh"

    log_info "All files downloaded to $INSTALL_DIR"
}
check_prerequisites() {
    # Verify the host has Docker, the NVIDIA runtime, and a visible GPU.
    # Only a missing Docker binary is fatal; the rest emit warnings.
    log_step "Checking prerequisites..."

    if ! command -v docker &> /dev/null; then
        log_error "Docker is not installed"
        exit 1
    fi

    # Non-fatal: the runtime may still work (e.g. set as default runtime).
    docker info 2>/dev/null | grep -q "Runtimes.*nvidia" \
        || log_warn "NVIDIA Docker runtime may not be configured"

    if command -v nvidia-smi &> /dev/null; then
        log_info "GPU detected:"
        nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
    else
        log_warn "nvidia-smi not found on host"
    fi

    log_info "Prerequisites check complete"
}
build_image() {
    # Build the container image from $INSTALL_DIR/Dockerfile.
    # --network=host lets the build reach package mirrors on hosts where
    # Docker's default bridge network is restricted.
    log_step "Building Docker image: ${IMAGE_NAME}"
    log_info "This may take several minutes..."

    cd "$INSTALL_DIR"

    docker build --network=host -t "$IMAGE_NAME" -f Dockerfile .

    log_info "Docker image built successfully: ${IMAGE_NAME}"
}
run_container() {
    # Launch the built image in one of several modes:
    #   interactive | test | demo | realtime | serve
    local mode="${1:-interactive}"

    log_step "Running container in ${mode} mode..."

    # Free the container name: stop a running instance, remove leftovers.
    if docker ps -q -f name="$CONTAINER_NAME" | grep -q .; then
        log_warn "Stopping existing container..."
        docker stop "$CONTAINER_NAME" 2>/dev/null || true
    fi
    docker rm "$CONTAINER_NAME" 2>/dev/null || true

    # Options shared by all modes: GPU access, host IPC/network,
    # unlimited locked memory, and the shared Hugging Face model cache.
    local docker_opts=(
        --gpus all
        --ipc=host
        --network=host
        --ulimit memlock=-1:-1
        --ulimit stack=-1:-1
        -e "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
        -v "$HOME/.cache/huggingface:/root/.cache/huggingface"
        --name "$CONTAINER_NAME"
    )

    case "$mode" in
        interactive)
            docker run --rm -it "${docker_opts[@]}" "$IMAGE_NAME" bash
            ;;
        test)
            docker run --rm "${docker_opts[@]}" "$IMAGE_NAME" python /workspace/test_vibevoice.py
            ;;
        demo)
            log_info "Starting Gradio demo on port 7860..."
            log_info "Access the demo at: http://localhost:7860"
            docker run --rm -it "${docker_opts[@]}" "$IMAGE_NAME"
            ;;
        realtime)
            log_info "Starting Realtime ASR server on port 8000..."
            log_info "WebSocket API: ws://localhost:8000/ws/asr/{session_id}"
            log_info "Test client: http://localhost:8000/static/realtime_client.html"
            docker run --rm -it "${docker_opts[@]}" "$IMAGE_NAME" \
                python -m realtime.server --host 0.0.0.0 --port 8000
            ;;
        serve)
            log_info "Starting all services..."
            log_info "  Gradio demo:  http://localhost:7860"
            log_info "  Realtime ASR: http://localhost:8000"
            log_info "  Test client:  http://localhost:8000/static/realtime_client.html"
            docker run --rm -it "${docker_opts[@]}" "$IMAGE_NAME" ./run_all.sh
            ;;
        *)
            log_error "Unknown mode: $mode"
            exit 1
            ;;
    esac
}
show_usage() {
    # Print command-line help (heredoc keeps the layout readable).
    cat <<EOF
VibeVoice-ASR Setup for DGX Spark

Usage:
  curl -sL $BASE_URL/setup.sh | bash            # Download only
  curl -sL $BASE_URL/setup.sh | bash -s build   # Download and build
  curl -sL $BASE_URL/setup.sh | bash -s demo    # Download, build, run demo
  curl -sL $BASE_URL/setup.sh | bash -s serve   # Download, build, run all

Commands:
  (default)   Download files only
  build       Download and build Docker image
  demo        Download, build, and start Gradio demo (port 7860)
  realtime    Download, build, and start Realtime ASR (port 8000)
  serve       Download, build, and start both services
  run         Run container interactively (after build)

Environment variables:
  VIBEVOICE_DIR   Installation directory (default: ~/vibevoice-asr)

After installation, you can also run:
  cd ~/vibevoice-asr
  docker run --gpus all -p 7860:7860 vibevoice-asr:dgx-spark
EOF
}
# Shared sequence for the build/demo/realtime/serve commands:
# fetch sources, verify the host, and build the image.
bootstrap() {
    download_files
    check_prerequisites
    build_image
}

main() {
    # Dispatch on the first CLI argument (default: download only).
    local command="${1:-download}"

    echo ""
    echo "=========================================="
    echo "  VibeVoice-ASR Setup for DGX Spark"
    echo "=========================================="
    echo ""

    case "$command" in
        download)
            download_files
            echo ""
            log_info "Done! Next steps:"
            echo "  cd $INSTALL_DIR"
            echo "  docker build -t vibevoice-asr:dgx-spark ."
            echo "  docker run --gpus all -p 7860:7860 vibevoice-asr:dgx-spark"
            ;;
        build)
            bootstrap
            echo ""
            log_info "Done! To run:"
            echo "  cd $INSTALL_DIR"
            echo "  docker run --gpus all -p 7860:7860 vibevoice-asr:dgx-spark"
            ;;
        run)
            cd "$INSTALL_DIR" 2>/dev/null || { log_error "Run 'build' first"; exit 1; }
            run_container interactive
            ;;
        test)
            cd "$INSTALL_DIR" 2>/dev/null || { log_error "Run 'build' first"; exit 1; }
            run_container test
            ;;
        demo)
            bootstrap
            run_container demo
            ;;
        realtime)
            bootstrap
            run_container realtime
            ;;
        serve)
            bootstrap
            run_container serve
            ;;
        -h|--help|help)
            show_usage
            ;;
        *)
            log_error "Unknown command: $command"
            show_usage
            exit 1
            ;;
    esac
}

main "$@"
899
static/scripts/vibevoice-asr/static/realtime_client.html
Normal file
899
static/scripts/vibevoice-asr/static/realtime_client.html
Normal file
@ -0,0 +1,899 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="ja">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>VibeVoice Realtime ASR Client</title>
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
--bg-primary: #1a1a2e;
|
||||||
|
--bg-secondary: #16213e;
|
||||||
|
--bg-tertiary: #0f3460;
|
||||||
|
--text-primary: #eaeaea;
|
||||||
|
--text-secondary: #a0a0a0;
|
||||||
|
--accent: #e94560;
|
||||||
|
--success: #4ade80;
|
||||||
|
--warning: #fbbf24;
|
||||||
|
--info: #60a5fa;
|
||||||
|
}
|
||||||
|
|
||||||
|
* {
|
||||||
|
box-sizing: border-box;
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
body {
|
||||||
|
font-family: 'Segoe UI', system-ui, sans-serif;
|
||||||
|
background: var(--bg-primary);
|
||||||
|
color: var(--text-primary);
|
||||||
|
min-height: 100vh;
|
||||||
|
padding: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 1200px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
header {
|
||||||
|
text-align: center;
|
||||||
|
margin-bottom: 30px;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 {
|
||||||
|
font-size: 2rem;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.subtitle {
|
||||||
|
color: var(--text-secondary);
|
||||||
|
}
|
||||||
|
|
||||||
|
.main-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 1fr 1fr;
|
||||||
|
gap: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 768px) {
|
||||||
|
.main-grid {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
.card {
|
||||||
|
background: var(--bg-secondary);
|
||||||
|
border-radius: 12px;
|
||||||
|
padding: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.card-title {
|
||||||
|
font-size: 1.1rem;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Controls */
|
||||||
|
.controls {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.control-row {
|
||||||
|
display: flex;
|
||||||
|
gap: 10px;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
input[type="text"], input[type="number"] {
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
border: 1px solid rgba(255,255,255,0.1);
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 10px 15px;
|
||||||
|
color: var(--text-primary);
|
||||||
|
flex: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
input:focus {
|
||||||
|
outline: none;
|
||||||
|
border-color: var(--accent);
|
||||||
|
}
|
||||||
|
|
||||||
|
button {
|
||||||
|
background: var(--accent);
|
||||||
|
border: none;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 12px 24px;
|
||||||
|
color: white;
|
||||||
|
font-weight: 600;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: all 0.2s;
|
||||||
|
}
|
||||||
|
|
||||||
|
button:hover {
|
||||||
|
filter: brightness(1.1);
|
||||||
|
}
|
||||||
|
|
||||||
|
button:disabled {
|
||||||
|
opacity: 0.5;
|
||||||
|
cursor: not-allowed;
|
||||||
|
}
|
||||||
|
|
||||||
|
button.secondary {
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
}
|
||||||
|
|
||||||
|
button.success {
|
||||||
|
background: var(--success);
|
||||||
|
color: #000;
|
||||||
|
}
|
||||||
|
|
||||||
|
button.stop {
|
||||||
|
background: #ef4444;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Status */
|
||||||
|
.status-bar {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 10px 15px;
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
border-radius: 8px;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-indicator {
|
||||||
|
width: 12px;
|
||||||
|
height: 12px;
|
||||||
|
border-radius: 50%;
|
||||||
|
background: var(--text-secondary);
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-indicator.connected {
|
||||||
|
background: var(--success);
|
||||||
|
box-shadow: 0 0 10px var(--success);
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-indicator.recording {
|
||||||
|
background: var(--accent);
|
||||||
|
box-shadow: 0 0 10px var(--accent);
|
||||||
|
animation: pulse 1s infinite;
|
||||||
|
}
|
||||||
|
|
||||||
|
@keyframes pulse {
|
||||||
|
0%, 100% { opacity: 1; }
|
||||||
|
50% { opacity: 0.5; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Transcription output */
|
||||||
|
.transcription-box {
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 15px;
|
||||||
|
min-height: 300px;
|
||||||
|
max-height: 500px;
|
||||||
|
overflow-y: auto;
|
||||||
|
font-family: 'Consolas', monospace;
|
||||||
|
line-height: 1.6;
|
||||||
|
}
|
||||||
|
|
||||||
|
.segment {
|
||||||
|
margin-bottom: 15px;
|
||||||
|
padding: 10px;
|
||||||
|
background: rgba(255,255,255,0.05);
|
||||||
|
border-radius: 6px;
|
||||||
|
border-left: 3px solid var(--accent);
|
||||||
|
}
|
||||||
|
|
||||||
|
.segment.partial {
|
||||||
|
border-left-color: var(--warning);
|
||||||
|
opacity: 0.8;
|
||||||
|
}
|
||||||
|
|
||||||
|
.segment-meta {
|
||||||
|
font-size: 0.8rem;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
margin-bottom: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.segment-text {
|
||||||
|
font-size: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Audio visualizer */
|
||||||
|
.visualizer-container {
|
||||||
|
height: 80px;
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
border-radius: 8px;
|
||||||
|
overflow: hidden;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
#visualizer {
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* VAD indicator */
|
||||||
|
.vad-indicator {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 10px;
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
border-radius: 8px;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.vad-bar {
|
||||||
|
flex: 1;
|
||||||
|
height: 8px;
|
||||||
|
background: rgba(255,255,255,0.1);
|
||||||
|
border-radius: 4px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
.vad-level {
|
||||||
|
height: 100%;
|
||||||
|
background: var(--success);
|
||||||
|
width: 0%;
|
||||||
|
transition: width 0.1s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.vad-level.speech {
|
||||||
|
background: var(--accent);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Stats */
|
||||||
|
.stats {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(3, 1fr);
|
||||||
|
gap: 10px;
|
||||||
|
margin-top: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-item {
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
padding: 10px;
|
||||||
|
border-radius: 8px;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-value {
|
||||||
|
font-size: 1.5rem;
|
||||||
|
font-weight: bold;
|
||||||
|
color: var(--accent);
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-label {
|
||||||
|
font-size: 0.75rem;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Log */
|
||||||
|
.log-box {
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 10px;
|
||||||
|
height: 150px;
|
||||||
|
overflow-y: auto;
|
||||||
|
font-family: 'Consolas', monospace;
|
||||||
|
font-size: 0.8rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-entry {
|
||||||
|
margin-bottom: 2px;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-entry.error {
|
||||||
|
color: #ef4444;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-entry.success {
|
||||||
|
color: var(--success);
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-entry.info {
|
||||||
|
color: var(--info);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Settings panel */
|
||||||
|
.settings-group {
|
||||||
|
margin-top: 15px;
|
||||||
|
padding: 15px;
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
border-radius: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.settings-group h4 {
|
||||||
|
margin-bottom: 10px;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
}
|
||||||
|
|
||||||
|
.setting-item {
|
||||||
|
margin-bottom: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.setting-item label {
|
||||||
|
display: block;
|
||||||
|
font-size: 0.85rem;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
}
|
||||||
|
|
||||||
|
.setting-item input[type="range"] {
|
||||||
|
width: 100%;
|
||||||
|
height: 6px;
|
||||||
|
border-radius: 3px;
|
||||||
|
background: rgba(255,255,255,0.1);
|
||||||
|
outline: none;
|
||||||
|
-webkit-appearance: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.setting-item input[type="range"]::-webkit-slider-thumb {
|
||||||
|
-webkit-appearance: none;
|
||||||
|
width: 16px;
|
||||||
|
height: 16px;
|
||||||
|
border-radius: 50%;
|
||||||
|
background: var(--accent);
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
.setting-value {
|
||||||
|
font-size: 0.8rem;
|
||||||
|
color: var(--accent);
|
||||||
|
float: right;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="container">
|
||||||
|
<header>
|
||||||
|
<h1>🎙️ VibeVoice Realtime ASR</h1>
|
||||||
|
<p class="subtitle">リアルタイム音声認識デモ</p>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<div class="status-bar">
|
||||||
|
<div class="status-indicator" id="statusIndicator"></div>
|
||||||
|
<span id="statusText">未接続</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="main-grid">
|
||||||
|
<!-- Left column: Controls -->
|
||||||
|
<div class="card">
|
||||||
|
<h2 class="card-title">⚙️ 設定</h2>
|
||||||
|
<div class="controls">
|
||||||
|
<div class="control-row">
|
||||||
|
<input type="text" id="serverUrl" placeholder="WebSocket URL"
|
||||||
|
value="ws://localhost:8000/ws/asr/demo">
|
||||||
|
</div>
|
||||||
|
<div class="control-row">
|
||||||
|
<button id="connectBtn" onclick="toggleConnection()">接続</button>
|
||||||
|
<button id="recordBtn" class="success" onclick="toggleRecording()" disabled>
|
||||||
|
🎤 録音開始
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="visualizer-container">
|
||||||
|
<canvas id="visualizer"></canvas>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="vad-indicator">
|
||||||
|
<span>VAD:</span>
|
||||||
|
<div class="vad-bar">
|
||||||
|
<div class="vad-level" id="vadLevel"></div>
|
||||||
|
</div>
|
||||||
|
<span id="vadStatus">待機中</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="settings-group">
|
||||||
|
<h4>🎚️ VAD設定</h4>
|
||||||
|
<div class="setting-item">
|
||||||
|
<label>
|
||||||
|
音声検出閾値
|
||||||
|
<span class="setting-value" id="vadThresholdValue">0.5</span>
|
||||||
|
</label>
|
||||||
|
<input type="range" id="vadThreshold" min="0.1" max="0.9" step="0.1" value="0.5"
|
||||||
|
onchange="updateConfig()">
|
||||||
|
</div>
|
||||||
|
<div class="setting-item">
|
||||||
|
<label>
|
||||||
|
最小発話時間 (ms)
|
||||||
|
<span class="setting-value" id="minSpeechValue">250</span>
|
||||||
|
</label>
|
||||||
|
<input type="range" id="minSpeechDuration" min="100" max="1000" step="50" value="250"
|
||||||
|
onchange="updateConfig()">
|
||||||
|
</div>
|
||||||
|
<div class="setting-item">
|
||||||
|
<label>
|
||||||
|
無音判定時間 (ms)
|
||||||
|
<span class="setting-value" id="minSilenceValue">500</span>
|
||||||
|
</label>
|
||||||
|
<input type="range" id="minSilenceDuration" min="200" max="2000" step="100" value="500"
|
||||||
|
onchange="updateConfig()">
|
||||||
|
</div>
|
||||||
|
<div class="setting-item">
|
||||||
|
<label>
|
||||||
|
最小音量閾値
|
||||||
|
<span class="setting-value" id="minVolumeValue">0.01</span>
|
||||||
|
</label>
|
||||||
|
<input type="range" id="minVolumeThreshold" min="0.001" max="0.1" step="0.001" value="0.01"
|
||||||
|
onchange="updateConfig()">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="stats">
|
||||||
|
<div class="stat-item">
|
||||||
|
<div class="stat-value" id="statDuration">0.0</div>
|
||||||
|
<div class="stat-label">録音時間 (秒)</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-item">
|
||||||
|
<div class="stat-value" id="statSegments">0</div>
|
||||||
|
<div class="stat-label">セグメント数</div>
|
||||||
|
</div>
|
||||||
|
<div class="stat-item">
|
||||||
|
<div class="stat-value" id="statLatency">-</div>
|
||||||
|
<div class="stat-label">レイテンシ (ms)</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3 class="card-title" style="margin-top: 20px;">📋 ログ</h3>
|
||||||
|
<div class="log-box" id="logBox"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Right column: Transcription -->
|
||||||
|
<div class="card">
|
||||||
|
<h2 class="card-title">📝 認識結果</h2>
|
||||||
|
<div class="control-row" style="margin-bottom: 15px;">
|
||||||
|
<button class="secondary" onclick="clearTranscription()">クリア</button>
|
||||||
|
<button class="secondary" onclick="copyTranscription()">コピー</button>
|
||||||
|
</div>
|
||||||
|
<div class="transcription-box" id="transcriptionBox">
|
||||||
|
<p style="color: var(--text-secondary); text-align: center; padding: 50px;">
|
||||||
|
接続して録音を開始すると、ここに認識結果が表示されます
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// State
|
||||||
|
let websocket = null;
|
||||||
|
let mediaStream = null;
|
||||||
|
let audioContext = null;
|
||||||
|
let processor = null;
|
||||||
|
let analyser = null;
|
||||||
|
let isRecording = false;
|
||||||
|
let recordingStartTime = null;
|
||||||
|
let segmentCount = 0;
|
||||||
|
let currentPartialText = '';
|
||||||
|
|
||||||
|
// DOM elements
|
||||||
|
const statusIndicator = document.getElementById('statusIndicator');
|
||||||
|
const statusText = document.getElementById('statusText');
|
||||||
|
const connectBtn = document.getElementById('connectBtn');
|
||||||
|
const recordBtn = document.getElementById('recordBtn');
|
||||||
|
const transcriptionBox = document.getElementById('transcriptionBox');
|
||||||
|
const logBox = document.getElementById('logBox');
|
||||||
|
const vadLevel = document.getElementById('vadLevel');
|
||||||
|
const vadStatus = document.getElementById('vadStatus');
|
||||||
|
|
||||||
|
// Logging
|
||||||
|
// Append a timestamped entry to the on-page log box and mirror it to the console.
// `type` selects the CSS modifier class ('info' | 'success' | 'error').
function log(message, type = 'info') {
    const line = document.createElement('div');
    line.className = `log-entry ${type}`;
    const stamp = new Date().toLocaleTimeString();
    line.textContent = `[${stamp}] ${message}`;
    logBox.appendChild(line);
    // Keep the newest entry visible.
    logBox.scrollTop = logBox.scrollHeight;
    console.log(`[${type}] ${message}`);
}
|
||||||
|
|
||||||
|
// Update config display values
|
||||||
|
// Mirror each slider's current value into its display <span>.
function updateConfigDisplay() {
    // [label span id, range input id] pairs.
    const bindings = [
        ['vadThresholdValue', 'vadThreshold'],
        ['minSpeechValue', 'minSpeechDuration'],
        ['minSilenceValue', 'minSilenceDuration'],
        ['minVolumeValue', 'minVolumeThreshold'],
    ];
    for (const [labelId, inputId] of bindings) {
        document.getElementById(labelId).textContent =
            document.getElementById(inputId).value;
    }
}
|
||||||
|
|
||||||
|
// Send config to server
|
||||||
|
// Refresh the slider labels and, when connected, push the current VAD
// settings to the server as a 'config' message.
function updateConfig() {
    updateConfigDisplay();

    // Nothing to send without an open socket.
    if (!websocket || websocket.readyState !== WebSocket.OPEN) {
        return;
    }

    const read = (id) => document.getElementById(id).value;
    const settings = {
        vad_threshold: parseFloat(read('vadThreshold')),
        min_speech_duration_ms: parseInt(read('minSpeechDuration')),
        min_silence_duration_ms: parseInt(read('minSilenceDuration')),
        min_volume_threshold: parseFloat(read('minVolumeThreshold')),
    };
    websocket.send(JSON.stringify({ type: 'config', config: settings }));
    log(`VAD設定を更新: 閾値=${settings.vad_threshold}, 最小発話=${settings.min_speech_duration_ms}ms, 無音判定=${settings.min_silence_duration_ms}ms, 最小音量=${settings.min_volume_threshold}`, 'info');
}
|
||||||
|
|
||||||
|
// Initialize config display on load
|
||||||
|
document.addEventListener('DOMContentLoaded', updateConfigDisplay);
|
||||||
|
|
||||||
|
// Connection
|
||||||
|
// Connect when idle; disconnect when the socket is currently open.
function toggleConnection() {
    const isOpen = websocket && websocket.readyState === WebSocket.OPEN;
    if (isOpen) {
        disconnect();
        return;
    }
    connect();
}
|
||||||
|
|
||||||
|
// Open a WebSocket to the ASR server (URL taken from the input field)
// and wire up its lifecycle handlers.
function connect() {
    const url = document.getElementById('serverUrl').value;
    log(`接続中: ${url}`, 'info');

    try {
        const ws = new WebSocket(url);
        websocket = ws;

        ws.onopen = () => {
            log('接続成功', 'success');
            statusIndicator.classList.add('connected');
            statusText.textContent = '接続済み';
            connectBtn.textContent = '切断';
            recordBtn.disabled = false;
        };

        ws.onclose = () => {
            log('切断されました', 'info');
            handleDisconnect();
        };

        ws.onerror = (error) => {
            log(`エラー: ${error}`, 'error');
        };

        // Server messages are JSON; route them to handleMessage.
        ws.onmessage = (event) => {
            handleMessage(JSON.parse(event.data));
        };
    } catch (error) {
        // e.g. malformed URL throws synchronously from the constructor.
        log(`接続エラー: ${error}`, 'error');
    }
}
|
||||||
|
|
||||||
|
// Stop any active recording, close the socket, and reset the connection UI.
function disconnect() {
    if (isRecording) stopRecording();
    if (websocket) websocket.close();
    handleDisconnect();
}
|
||||||
|
|
||||||
|
// Reset all connection-related UI state after the socket goes away.
function handleDisconnect() {
    statusIndicator.classList.remove('connected', 'recording');
    statusText.textContent = '未接続';
    connectBtn.textContent = '接続';
    recordBtn.disabled = true;
    websocket = null;
}
|
||||||
|
|
||||||
|
// Message handling
|
||||||
|
// Route a parsed server message to the handler matching its `type` field.
function handleMessage(data) {
    const kind = data.type;

    if (kind === 'status') {
        log(`ステータス: ${data.message}`, 'info');
    } else if (kind === 'partial_result') {
        updatePartialResult(data);
    } else if (kind === 'final_result') {
        addFinalResult(data);
    } else if (kind === 'vad_event') {
        handleVADEvent(data);
    } else if (kind === 'error') {
        log(`サーバーエラー: ${data.error}`, 'error');
    } else if (kind === 'pong') {
        // Heartbeat reply — nothing to do.
    } else {
        log(`不明なメッセージ: ${data.type}`, 'info');
    }
}
|
||||||
|
|
||||||
|
// Store the in-progress transcription text, refresh the display,
// and update the latency stat when the server reports one.
function updatePartialResult(data) {
    currentPartialText = data.text;
    updateTranscriptionDisplay();

    if (data.latency_ms) {
        const latencyEl = document.getElementById('statLatency');
        latencyEl.textContent = Math.round(data.latency_ms);
    }
}
|
||||||
|
|
||||||
|
// Append a finalized transcription segment to the result pane and bump stats.
//
// Fixes:
//  - Security: the server-provided transcription text was interpolated into
//    innerHTML, so transcribed content containing markup could inject
//    HTML/script into the page. The segment is now built with
//    createElement/textContent, which renders the text verbatim.
//  - The stale "認識中..." partial element was left in place after a final
//    result (it was only cleared on the next partial update); it is now
//    removed here as well.
function addFinalResult(data) {
    currentPartialText = '';
    const stalePartial = transcriptionBox.querySelector('.segment.partial');
    if (stalePartial) stalePartial.remove();

    segmentCount++;
    document.getElementById('statSegments').textContent = segmentCount;

    if (data.latency_ms) {
        document.getElementById('statLatency').textContent =
            Math.round(data.latency_ms);
    }

    // Time range / speaker line shown above the text (empty when absent).
    let metaText = '';
    if (data.segments && data.segments.length > 0) {
        const seg = data.segments[0];
        metaText = `[${seg.start_time?.toFixed(2) || '?'}s - ${seg.end_time?.toFixed(2) || '?'}s] ${seg.speaker_id || ''}`;
    }

    // Build the segment DOM without HTML interpolation (XSS-safe).
    const segment = document.createElement('div');
    segment.className = 'segment';
    const meta = document.createElement('div');
    meta.className = 'segment-meta';
    meta.textContent = metaText;
    const text = document.createElement('div');
    text.className = 'segment-text';
    text.textContent = data.text;
    segment.append(meta, text);

    // Remove placeholder if exists
    const placeholder = transcriptionBox.querySelector('p');
    if (placeholder) placeholder.remove();

    transcriptionBox.appendChild(segment);
    transcriptionBox.scrollTop = transcriptionBox.scrollHeight;

    log(`認識完了: "${data.text.substring(0, 30)}..."`, 'success');
}
|
||||||
|
|
||||||
|
// Show, update, or remove the "recognizing" partial segment.
//
// Security fix: the partial text comes from the server and was previously
// interpolated into innerHTML, allowing markup/script injection. The element
// is now built once with createElement and the text is written via
// textContent, which renders it verbatim.
function updateTranscriptionDisplay() {
    let partialDiv = transcriptionBox.querySelector('.segment.partial');

    if (currentPartialText) {
        if (!partialDiv) {
            partialDiv = document.createElement('div');
            partialDiv.className = 'segment partial';
            const meta = document.createElement('div');
            meta.className = 'segment-meta';
            meta.textContent = '認識中...';
            const text = document.createElement('div');
            text.className = 'segment-text';
            partialDiv.append(meta, text);
            transcriptionBox.appendChild(partialDiv);
        }
        const textEl = partialDiv.querySelector('.segment-text');
        if (textEl) textEl.textContent = currentPartialText;
        transcriptionBox.scrollTop = transcriptionBox.scrollHeight;
    } else if (partialDiv) {
        partialDiv.remove();
    }
}
|
||||||
|
|
||||||
|
// Reflect server-side VAD speech start/end events in the level bar and log.
function handleVADEvent(data) {
    const stamp = data.audio_timestamp_sec?.toFixed(2);
    if (data.event === 'speech_start') {
        vadLevel.classList.add('speech');
        vadStatus.textContent = '発話中';
        log(`発話開始 @ ${stamp}s`, 'info');
    } else if (data.event === 'speech_end') {
        vadLevel.classList.remove('speech');
        vadStatus.textContent = '待機中';
        log(`発話終了 @ ${stamp}s`, 'info');
    }
}
|
||||||
|
|
||||||
|
// Recording
|
||||||
|
// Flip between recording and idle states (record button handler).
async function toggleRecording() {
    if (isRecording) {
        stopRecording();
        return;
    }
    await startRecording();
}
|
||||||
|
|
||||||
|
// Acquire the microphone, build the Web Audio capture graph, and start
// streaming 16 kHz mono 16-bit PCM frames to the server over the WebSocket.
// Graph: mic source -> analyser (visualization) and mic source ->
// ScriptProcessor (PCM conversion + send) -> destination.
// NOTE(review): ScriptProcessorNode is deprecated in favor of AudioWorklet;
// kept as-is here since replacing it would change the processing model.
async function startRecording() {
    try {
        log('マイクアクセスをリクエスト中...', 'info');

        // Request a mono 16 kHz stream; browsers may ignore sampleRate here,
        // so the AudioContext below also pins 16 kHz.
        mediaStream = await navigator.mediaDevices.getUserMedia({
            audio: {
                sampleRate: 16000,
                channelCount: 1,
                echoCancellation: true,
                noiseSuppression: true,
            }
        });

        // webkit prefix covers older Safari.
        audioContext = new (window.AudioContext || window.webkitAudioContext)({
            sampleRate: 16000
        });

        const source = audioContext.createMediaStreamSource(mediaStream);

        // Analyser for visualization
        analyser = audioContext.createAnalyser();
        analyser.fftSize = 256;
        source.connect(analyser);

        // ScriptProcessor for sending audio (4096-sample frames, 1-in/1-out)
        const bufferSize = 4096;
        processor = audioContext.createScriptProcessor(bufferSize, 1, 1);

        processor.onaudioprocess = (e) => {
            // Drop frames when not recording or the socket is gone.
            if (!isRecording || !websocket || websocket.readyState !== WebSocket.OPEN) {
                return;
            }

            const inputData = e.inputBuffer.getChannelData(0);

            // Convert float32 [-1, 1] samples to 16-bit PCM, clamped to
            // the int16 range.
            const pcmData = new Int16Array(inputData.length);
            for (let i = 0; i < inputData.length; i++) {
                pcmData[i] = Math.max(-32768, Math.min(32767, inputData[i] * 32768));
            }

            // Send as binary (raw little-endian PCM16 frames)
            websocket.send(pcmData.buffer);
        };

        source.connect(processor);
        // ScriptProcessor must be connected to a destination to fire events.
        processor.connect(audioContext.destination);

        isRecording = true;
        recordingStartTime = Date.now();
        statusIndicator.classList.add('recording');
        recordBtn.textContent = '⏹️ 録音停止';
        recordBtn.classList.remove('success');
        recordBtn.classList.add('stop');

        // Start visualization and the duration counter loops.
        visualize();
        updateDuration();

        log('録音開始', 'success');

    } catch (error) {
        // getUserMedia rejection (permission denied, no device) lands here.
        log(`マイクエラー: ${error}`, 'error');
    }
}
|
||||||
|
|
||||||
|
// Tear down the audio capture pipeline, restore the UI, and tell the
// server the utterance has ended.
function stopRecording() {
    isRecording = false;

    if (processor) {
        processor.disconnect();
        processor = null;
    }

    if (audioContext) {
        audioContext.close();
        audioContext = null;
    }

    if (mediaStream) {
        for (const track of mediaStream.getTracks()) {
            track.stop();
        }
        mediaStream = null;
    }

    statusIndicator.classList.remove('recording');
    recordBtn.textContent = '🎤 録音開始';
    recordBtn.classList.remove('stop');
    recordBtn.classList.add('success');

    // Ask the server to finalize the current segment.
    if (websocket && websocket.readyState === WebSocket.OPEN) {
        websocket.send(JSON.stringify({ type: 'stop' }));
    }

    log('録音停止', 'info');
}
|
||||||
|
|
||||||
|
// Visualization
|
||||||
|
// Draw a live frequency-bar spectrum on the #visualizer canvas and feed the
// average level into the VAD indicator bar. Runs a requestAnimationFrame
// loop that stops itself when recording ends.
function visualize() {
    if (!analyser || !isRecording) return;

    const canvas = document.getElementById('visualizer');
    const ctx = canvas.getContext('2d');
    // Match the backing buffer to the CSS size so bars aren't stretched.
    const width = canvas.width = canvas.offsetWidth;
    const height = canvas.height = canvas.offsetHeight;

    // frequencyBinCount = fftSize / 2 bins of byte magnitudes (0-255).
    const bufferLength = analyser.frequencyBinCount;
    const dataArray = new Uint8Array(bufferLength);

    function draw() {
        if (!isRecording) return;
        requestAnimationFrame(draw);

        analyser.getByteFrequencyData(dataArray);

        // Clear with the panel background color.
        ctx.fillStyle = 'rgb(15, 52, 96)';
        ctx.fillRect(0, 0, width, height);

        // 2.5x widens the bars; later bins overflow the right edge, which
        // is intentional (low frequencies dominate speech).
        const barWidth = (width / bufferLength) * 2.5;
        let x = 0;

        // Calculate average for VAD indicator
        let sum = 0;
        for (let i = 0; i < bufferLength; i++) {
            const barHeight = (dataArray[i] / 255) * height;

            // Bottom-to-top red-to-green gradient per bar.
            const gradient = ctx.createLinearGradient(0, height, 0, height - barHeight);
            gradient.addColorStop(0, '#e94560');
            gradient.addColorStop(1, '#4ade80');

            ctx.fillStyle = gradient;
            ctx.fillRect(x, height - barHeight, barWidth, barHeight);

            x += barWidth + 1;
            sum += dataArray[i];
        }

        // Update VAD level indicator (mean magnitude as a 0-100% width).
        const avgLevel = (sum / bufferLength / 255) * 100;
        vadLevel.style.width = `${avgLevel}%`;
    }

    draw();
}
|
||||||
|
|
||||||
|
// Refresh the elapsed-recording-time stat once per animation frame;
// the loop ends itself when recording stops.
function updateDuration() {
    if (!isRecording) return;

    const elapsedSec = (Date.now() - recordingStartTime) / 1000;
    document.getElementById('statDuration').textContent = elapsedSec.toFixed(1);

    requestAnimationFrame(updateDuration);
}
|
||||||
|
|
||||||
|
// Utility functions
|
||||||
|
// Reset the transcription pane and its counters to the initial state.
function clearTranscription() {
    transcriptionBox.innerHTML = `
        <p style="color: var(--text-secondary); text-align: center; padding: 50px;">
            接続して録音を開始すると、ここに認識結果が表示されます
        </p>
    `;
    segmentCount = 0;
    currentPartialText = '';
    document.getElementById('statSegments').textContent = '0';
    document.getElementById('statLatency').textContent = '-';
    log('認識結果をクリアしました', 'info');
}
|
||||||
|
|
||||||
|
// Copy all finalized segment texts (newline-separated) to the clipboard.
// Partial ("recognizing") segments are excluded.
function copyTranscription() {
    const nodes = transcriptionBox.querySelectorAll('.segment:not(.partial) .segment-text');
    const combined = Array.from(nodes, (node) => node.textContent).join('\n');

    if (!combined) return;
    navigator.clipboard.writeText(combined).then(() => {
        log('認識結果をコピーしました', 'success');
    });
}
|
||||||
|
|
||||||
|
// Heartbeat
|
||||||
|
// Heartbeat: ping the server every 30 s so idle proxies/load balancers
// don't drop the WebSocket; the server replies with 'pong' (ignored).
setInterval(() => {
    if (websocket && websocket.readyState === WebSocket.OPEN) {
        websocket.send(JSON.stringify({ type: 'ping' }));
    }
}, 30000);
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
173
static/scripts/vibevoice-asr/test_vibevoice.py
Normal file
173
static/scripts/vibevoice-asr/test_vibevoice.py
Normal file
@ -0,0 +1,173 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
VibeVoice-ASR Test Script for DGX Spark
|
||||||
|
Tests basic functionality and GPU availability
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
def test_imports() -> bool:
    """Check that the vibevoice package is importable.

    Returns:
        True when the import succeeds, False otherwise.
    """
    banner = "=" * 60
    print(banner)
    print("Testing VibeVoice imports...")
    print(banner)

    try:
        import vibevoice  # noqa: F401  (import is the test itself)
    except ImportError as e:
        print(f"[FAIL] Failed to import vibevoice: {e}")
        return False
    print("[OK] vibevoice imported successfully")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def test_torch_cuda() -> bool:
    """Report PyTorch/CUDA availability and run a tiny GPU sanity op.

    Fixes over the original: the matmul result was computed but never
    inspected (so a broken kernel could still "pass"); the result shape is
    now verified. Also removed an f-string with no placeholders.

    Returns:
        True when CUDA is available and the smoke test succeeds,
        False otherwise.
    """
    print("\n" + "=" * 60)
    print("Testing PyTorch CUDA...")
    print("=" * 60)

    try:
        import torch
        print(f"[INFO] PyTorch version: {torch.__version__}")
        print(f"[INFO] CUDA available: {torch.cuda.is_available()}")

        if torch.cuda.is_available():
            print(f"[INFO] CUDA version: {torch.version.cuda}")
            print(f"[INFO] GPU count: {torch.cuda.device_count()}")

            for i in range(torch.cuda.device_count()):
                props = torch.cuda.get_device_properties(i)
                print(f"[INFO] GPU {i}: {props.name}")
                print(f"  Compute capability: {props.major}.{props.minor}")
                print(f"  Total memory: {props.total_memory / 1024**3:.1f} GB")

            # Quick CUDA smoke test: actually verify the matmul result so a
            # failing kernel cannot slip through unnoticed.
            x = torch.randn(100, 100, device='cuda')
            y = torch.matmul(x, x)
            if y.shape != (100, 100):
                raise RuntimeError(f"unexpected matmul result shape: {y.shape}")
            print("[OK] CUDA tensor operations working")
            return True
        else:
            print("[WARN] CUDA not available")
            return False

    except Exception as e:
        # Broad on purpose: any CUDA/driver failure should report, not crash.
        print(f"[FAIL] PyTorch CUDA test failed: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def test_flash_attention() -> bool:
    """Report whether the optional flash_attn package is present.

    Returns:
        Always True — flash attention is optional, so a missing package
        is not a failure.
    """
    print("\n" + "=" * 60)
    print("Testing Flash Attention...")
    print("=" * 60)

    try:
        import flash_attn
    except ImportError:
        print("[WARN] flash_attn not installed (optional)")
        return True  # Not required
    print(f"[OK] flash_attn version: {flash_attn.__version__}")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def test_ffmpeg() -> bool:
    """Check that the ffmpeg binary is on PATH and runs.

    Returns:
        True when `ffmpeg -version` exits cleanly, False when the binary
        is missing or returns a nonzero exit code.
    """
    print("\n" + "=" * 60)
    print("Testing FFmpeg...")
    print("=" * 60)

    try:
        result = subprocess.run(
            ["ffmpeg", "-version"],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        print("[FAIL] FFmpeg not found")
        return False

    if result.returncode != 0:
        print("[FAIL] FFmpeg returned error")
        return False
    # First stdout line carries the version banner.
    version_line = result.stdout.split('\n')[0]
    print(f"[OK] {version_line}")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def test_asr_model() -> bool:
    """Attempt to load the VibeVoice ASR pipeline (GPU only, best-effort).

    Skips when no GPU is present, and deliberately treats import or load
    failures as non-fatal since the pipeline API varies across VibeVoice
    versions.

    Returns:
        Always True (skip/warn paths included).
    """
    print("\n" + "=" * 60)
    print("Testing ASR Model Loading...")
    print("=" * 60)

    try:
        import torch
        if not torch.cuda.is_available():
            print("[SKIP] Skipping model test - no GPU available")
            return True

        # Try to load the ASR pipeline with its default (test-friendly) model.
        from vibevoice import ASRPipeline
        print("[INFO] Loading ASR pipeline...")

        pipeline = ASRPipeline()
        print("[OK] ASR pipeline loaded successfully")

        # Free GPU memory before the next test runs.
        del pipeline
        torch.cuda.empty_cache()
        return True

    except ImportError as e:
        print(f"[WARN] ASRPipeline not available: {e}")
        print("[INFO] This may be normal depending on VibeVoice version")
        return True
    except Exception as e:
        # Best-effort check: warn but do not fail the suite.
        print(f"[WARN] ASR model test: {e}")
        return True
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Run every check and print a pass/fail summary.

    Returns:
        0 when all checks pass, 1 otherwise (used as the process exit code).
    """
    print("\n")
    print("*" * 60)
    print(" VibeVoice-ASR Test Suite for DGX Spark")
    print("*" * 60)

    # Each value is the boolean outcome of one check.
    results = {
        "imports": test_imports(),
        "torch_cuda": test_torch_cuda(),
        "flash_attention": test_flash_attention(),
        "ffmpeg": test_ffmpeg(),
        "asr_model": test_asr_model(),
    }

    print("\n")
    print("=" * 60)
    print("Test Summary")
    print("=" * 60)

    for name, passed in results.items():
        status = "[OK]" if passed else "[FAIL]"
        print(f"  {status} {name}")

    print("=" * 60)

    if all(results.values()):
        print("\nAll tests passed!")
        return 0
    print("\nSome tests failed.")
    return 1


if __name__ == "__main__":
    sys.exit(main())
|
||||||
1229
static/scripts/vibevoice-asr/vibevoice_asr_gradio_demo_patched.py
Normal file
1229
static/scripts/vibevoice-asr/vibevoice_asr_gradio_demo_patched.py
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user