โ๏ธ ่จญๅฎ
+๐๏ธ VAD่จญๅฎ
+๐ ใญใฐ
+ +๐ ่ช่ญ็ตๆ
++ ๆฅ็ถใใฆ้ฒ้ณใ้ๅงใใใจใใใใซ่ช่ญ็ตๆใ่กจ็คบใใใพใ +
+diff --git a/static/scripts/vibevoice-asr/Dockerfile b/static/scripts/vibevoice-asr/Dockerfile new file mode 100644 index 0000000..b01c1b6 --- /dev/null +++ b/static/scripts/vibevoice-asr/Dockerfile @@ -0,0 +1,61 @@ +# VibeVoice-ASR for DGX Spark (ARM64, Blackwell GB10, sm_121) +# Based on NVIDIA PyTorch container for CUDA 13.1 compatibility + +ARG TARGETARCH +FROM nvcr.io/nvidia/pytorch:25.11-py3 AS base + +LABEL maintainer="VibeVoice-ASR DGX Spark Setup" +LABEL description="VibeVoice-ASR optimized for DGX Spark (ARM64, CUDA 13.1)" + +# Set environment variables +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +# PyTorch CUDA settings for DGX Spark +ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +ENV USE_LIBUV=0 + +# Set working directory +WORKDIR /workspace + +# Install system dependencies including FFmpeg for demo +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg \ + git \ + curl \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install flash-attn if not already present +RUN pip install --no-cache-dir flash-attn --no-build-isolation || true + +# Clone and install VibeVoice +RUN git clone https://github.com/microsoft/VibeVoice.git /workspace/VibeVoice && \ + cd /workspace/VibeVoice && \ + pip install --no-cache-dir -e . 
+ +# Create test script and patched demo with MKV support +COPY test_vibevoice.py /workspace/test_vibevoice.py +COPY vibevoice_asr_gradio_demo_patched.py /workspace/VibeVoice/demo/vibevoice_asr_gradio_demo.py + +# Install real-time ASR dependencies +COPY requirements-realtime.txt /workspace/requirements-realtime.txt +RUN pip install --no-cache-dir -r /workspace/requirements-realtime.txt + +# Copy real-time ASR module and startup scripts +COPY realtime/ /workspace/VibeVoice/realtime/ +COPY static/ /workspace/VibeVoice/static/ +COPY run_all.sh /workspace/VibeVoice/run_all.sh +COPY run_realtime.sh /workspace/VibeVoice/run_realtime.sh +RUN chmod +x /workspace/VibeVoice/run_all.sh /workspace/VibeVoice/run_realtime.sh + +# Set default working directory to VibeVoice +WORKDIR /workspace/VibeVoice + +# Expose Gradio port and WebSocket port +EXPOSE 7860 +EXPOSE 8000 + +# Default command: Launch Gradio demo with MKV support +CMD ["python", "demo/vibevoice_asr_gradio_demo.py", "--model_path", "microsoft/VibeVoice-ASR", "--host", "0.0.0.0"] diff --git a/static/scripts/vibevoice-asr/realtime/__init__.py b/static/scripts/vibevoice-asr/realtime/__init__.py new file mode 100644 index 0000000..0fbe8cb --- /dev/null +++ b/static/scripts/vibevoice-asr/realtime/__init__.py @@ -0,0 +1,7 @@ +""" +VibeVoice Realtime ASR Module + +WebSocket-based real-time speech recognition using VibeVoice ASR. +""" + +__version__ = "0.1.0" diff --git a/static/scripts/vibevoice-asr/realtime/asr_worker.py b/static/scripts/vibevoice-asr/realtime/asr_worker.py new file mode 100644 index 0000000..8e37d86 --- /dev/null +++ b/static/scripts/vibevoice-asr/realtime/asr_worker.py @@ -0,0 +1,358 @@ +""" +ASR Worker for real-time transcription. + +Wraps the existing VibeVoiceASRInference for async/streaming operation. 
+""" + +import sys +import os +import asyncio +import threading +import time +import queue +from typing import AsyncGenerator, Optional, List, Callable +from dataclasses import dataclass +import numpy as np +import torch + +# Add parent directory and demo directory to path for importing existing code +_parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, _parent_dir) +sys.path.insert(0, os.path.join(_parent_dir, "demo")) + +from .models import ( + TranscriptionResult, + TranscriptionSegment, + MessageType, + SessionConfig, +) + + +@dataclass +class InferenceRequest: + """Request for ASR inference.""" + audio: np.ndarray + sample_rate: int + context_info: Optional[str] + request_time: float + segment_start_sec: float + segment_end_sec: float + + +class ASRWorker: + """ + ASR Worker that wraps VibeVoiceASRInference for real-time use. + + Features: + - Async interface for WebSocket integration + - Streaming output via TextIteratorStreamer + - Request queuing for handling concurrent segments + - Graceful model loading and error handling + """ + + def __init__( + self, + model_path: str = "microsoft/VibeVoice-ASR", + device: str = "cuda", + dtype: torch.dtype = torch.bfloat16, + attn_implementation: str = "flash_attention_2", + ): + """ + Initialize the ASR worker. + + Args: + model_path: Path to VibeVoice ASR model + device: Device to run inference on + dtype: Model data type + attn_implementation: Attention implementation + """ + self.model_path = model_path + self.device = device + self.dtype = dtype + self.attn_implementation = attn_implementation + + self._inference = None + self._is_loaded = False + self._load_lock = threading.Lock() + + # Inference queue for serializing requests + self._inference_semaphore = asyncio.Semaphore(1) + + def load_model(self) -> bool: + """ + Load the ASR model. 
+ + Returns: + True if model loaded successfully + """ + with self._load_lock: + if self._is_loaded: + return True + + try: + # Import here to avoid circular imports and allow lazy loading + # In Docker, the file is copied as vibevoice_asr_gradio_demo.py + try: + from vibevoice_asr_gradio_demo import VibeVoiceASRInference + except ImportError: + from vibevoice_asr_gradio_demo_patched import VibeVoiceASRInference + + print(f"Loading VibeVoice ASR model from {self.model_path}...") + self._inference = VibeVoiceASRInference( + model_path=self.model_path, + device=self.device, + dtype=self.dtype, + attn_implementation=self.attn_implementation, + ) + self._is_loaded = True + print("ASR model loaded successfully") + return True + + except Exception as e: + print(f"Failed to load ASR model: {e}") + import traceback + traceback.print_exc() + return False + + @property + def is_loaded(self) -> bool: + """Check if model is loaded.""" + return self._is_loaded + + async def transcribe_segment( + self, + audio: np.ndarray, + sample_rate: int = 16000, + context_info: Optional[str] = None, + segment_start_sec: float = 0.0, + segment_end_sec: float = 0.0, + config: Optional[SessionConfig] = None, + on_partial: Optional[Callable[[TranscriptionResult], None]] = None, + ) -> TranscriptionResult: + """ + Transcribe an audio segment asynchronously. 
+ + Args: + audio: Audio data as float32 array + sample_rate: Audio sample rate + context_info: Optional context for transcription + segment_start_sec: Start time of segment in session + segment_end_sec: End time of segment in session + config: Session configuration + on_partial: Callback for partial results + + Returns: + Final transcription result + """ + if not self._is_loaded: + if not self.load_model(): + return TranscriptionResult( + type=MessageType.ERROR, + text="", + is_final=True, + latency_ms=0, + ) + + config = config or SessionConfig() + request_time = time.time() + + # Serialize inference requests + async with self._inference_semaphore: + return await self._run_inference( + audio=audio, + sample_rate=sample_rate, + context_info=context_info, + segment_start_sec=segment_start_sec, + segment_end_sec=segment_end_sec, + config=config, + request_time=request_time, + on_partial=on_partial, + ) + + async def _run_inference( + self, + audio: np.ndarray, + sample_rate: int, + context_info: Optional[str], + segment_start_sec: float, + segment_end_sec: float, + config: SessionConfig, + request_time: float, + on_partial: Optional[Callable[[TranscriptionResult], None]], + ) -> TranscriptionResult: + """Run the actual inference in a thread pool.""" + from transformers import TextIteratorStreamer + + # Create streamer for partial results + streamer = None + if config.return_partial_results and on_partial: + streamer = TextIteratorStreamer( + self._inference.processor.tokenizer, + skip_prompt=True, + skip_special_tokens=True, + ) + + # Result container for thread + result_container = {"result": None, "error": None} + + def run_inference(): + try: + # Save audio to temp file (required by current implementation) + import tempfile + import soundfile as sf + + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: + temp_path = f.name + + # Write audio + audio_int16 = (audio * 32768.0).clip(-32768, 32767).astype(np.int16) + sf.write(temp_path, audio_int16, 
sample_rate, subtype='PCM_16') + + try: + result = self._inference.transcribe( + audio_path=temp_path, + max_new_tokens=config.max_new_tokens, + temperature=config.temperature, + context_info=context_info, + streamer=streamer, + ) + result_container["result"] = result + finally: + # Clean up temp file + try: + os.unlink(temp_path) + except: + pass + + except Exception as e: + result_container["error"] = str(e) + import traceback + traceback.print_exc() + + # Start inference in background thread + inference_thread = threading.Thread(target=run_inference) + inference_thread.start() + + # Stream partial results if enabled + partial_text = "" + if streamer and on_partial: + try: + for new_text in streamer: + partial_text += new_text + partial_result = TranscriptionResult( + type=MessageType.PARTIAL_RESULT, + text=partial_text, + is_final=False, + latency_ms=(time.time() - request_time) * 1000, + ) + # Call callback (may be async) + if asyncio.iscoroutinefunction(on_partial): + await on_partial(partial_result) + else: + on_partial(partial_result) + except Exception as e: + print(f"Error during streaming: {e}") + + # Wait for completion + inference_thread.join() + + latency_ms = (time.time() - request_time) * 1000 + + if result_container["error"]: + return TranscriptionResult( + type=MessageType.ERROR, + text=f"Error: {result_container['error']}", + is_final=True, + latency_ms=latency_ms, + ) + + result = result_container["result"] + + # Convert segments to our format + segments = [] + for seg in result.get("segments", []): + # Adjust timestamps relative to session + seg_start = seg.get("start_time", 0) + seg_end = seg.get("end_time", 0) + + # If segment has relative timestamps, adjust to absolute + if isinstance(seg_start, (int, float)) and isinstance(seg_end, (int, float)): + adjusted_start = segment_start_sec + seg_start + adjusted_end = segment_start_sec + seg_end + else: + adjusted_start = segment_start_sec + adjusted_end = segment_end_sec + + 
segments.append(TranscriptionSegment( + start_time=adjusted_start, + end_time=adjusted_end, + speaker_id=seg.get("speaker_id", "SPEAKER_00"), + text=seg.get("text", ""), + )) + + return TranscriptionResult( + type=MessageType.FINAL_RESULT, + text=result.get("raw_text", ""), + is_final=True, + segments=segments, + latency_ms=latency_ms, + ) + + async def transcribe_stream( + self, + audio: np.ndarray, + sample_rate: int = 16000, + context_info: Optional[str] = None, + segment_start_sec: float = 0.0, + segment_end_sec: float = 0.0, + config: Optional[SessionConfig] = None, + ) -> AsyncGenerator[TranscriptionResult, None]: + """ + Transcribe an audio segment with streaming output. + + Yields partial results followed by final result. + + Args: + audio: Audio data + sample_rate: Sample rate + context_info: Optional context + segment_start_sec: Segment start time + segment_end_sec: Segment end time + config: Session config + + Yields: + TranscriptionResult objects (partial and final) + """ + result_queue: asyncio.Queue = asyncio.Queue() + + async def on_partial(result: TranscriptionResult): + await result_queue.put(result) + + # Start transcription task + transcribe_task = asyncio.create_task( + self.transcribe_segment( + audio=audio, + sample_rate=sample_rate, + context_info=context_info, + segment_start_sec=segment_start_sec, + segment_end_sec=segment_end_sec, + config=config, + on_partial=on_partial, + ) + ) + + # Yield partial results as they come + while not transcribe_task.done(): + try: + result = await asyncio.wait_for(result_queue.get(), timeout=0.1) + yield result + except asyncio.TimeoutError: + continue + + # Drain any remaining partial results + while not result_queue.empty(): + yield await result_queue.get() + + # Yield final result + final_result = await transcribe_task + yield final_result diff --git a/static/scripts/vibevoice-asr/realtime/audio_buffer.py b/static/scripts/vibevoice-asr/realtime/audio_buffer.py new file mode 100644 index 0000000..855624f 
--- /dev/null +++ b/static/scripts/vibevoice-asr/realtime/audio_buffer.py @@ -0,0 +1,246 @@ +""" +Audio buffer management for real-time ASR. + +Implements a ring buffer for efficient audio chunk management with overlap support. +""" + +import numpy as np +from typing import Optional, Tuple +from dataclasses import dataclass +import threading + + +@dataclass +class AudioChunkInfo: + """Information about an extracted audio chunk.""" + audio: np.ndarray + start_sample: int + end_sample: int + start_sec: float + end_sec: float + + +class AudioBuffer: + """ + Ring buffer for managing audio chunks with overlap support. + + Features: + - Efficient memory management with fixed-size buffer + - Overlap handling for continuous processing + - Thread-safe operations + - Automatic sample rate tracking + """ + + def __init__( + self, + sample_rate: int = 16000, + chunk_duration_sec: float = 3.0, + overlap_sec: float = 0.5, + max_buffer_sec: float = 60.0, + ): + """ + Initialize the audio buffer. + + Args: + sample_rate: Audio sample rate in Hz + chunk_duration_sec: Duration of each processing chunk + overlap_sec: Overlap between consecutive chunks + max_buffer_sec: Maximum buffer duration (older data will be discarded) + """ + self.sample_rate = sample_rate + self.chunk_size = int(chunk_duration_sec * sample_rate) + self.overlap_size = int(overlap_sec * sample_rate) + self.max_buffer_size = int(max_buffer_sec * sample_rate) + + # Main buffer (pre-allocated) + self._buffer = np.zeros(self.max_buffer_size, dtype=np.float32) + self._write_pos = 0 # Next position to write + self._read_pos = 0 # Position of unprocessed data start + self._total_samples_received = 0 # Total samples since session start + + self._lock = threading.Lock() + + @property + def samples_available(self) -> int: + """Number of unprocessed samples in buffer.""" + with self._lock: + return self._write_pos - self._read_pos + + @property + def duration_available_sec(self) -> float: + """Duration of unprocessed audio 
in seconds.""" + return self.samples_available / self.sample_rate + + @property + def total_duration_sec(self) -> float: + """Total duration of audio received since session start.""" + return self._total_samples_received / self.sample_rate + + def append(self, audio_chunk: np.ndarray) -> int: + """ + Append audio chunk to the buffer. + + Args: + audio_chunk: Audio data as float32 array (range: -1.0 to 1.0) + + Returns: + Number of samples actually appended + """ + if audio_chunk.dtype != np.float32: + audio_chunk = audio_chunk.astype(np.float32) + + # Ensure 1D + if audio_chunk.ndim > 1: + audio_chunk = audio_chunk.flatten() + + with self._lock: + chunk_len = len(audio_chunk) + + # Check if we need to shift buffer (running out of space) + if self._write_pos + chunk_len > self.max_buffer_size: + self._compact_buffer() + + # Still not enough space? Discard old unprocessed data + if self._write_pos + chunk_len > self.max_buffer_size: + overflow = (self._write_pos + chunk_len) - self.max_buffer_size + self._read_pos = min(self._read_pos + overflow, self._write_pos) + self._compact_buffer() + + # Write to buffer + end_pos = self._write_pos + chunk_len + self._buffer[self._write_pos:end_pos] = audio_chunk + self._write_pos = end_pos + self._total_samples_received += chunk_len + + return chunk_len + + def _compact_buffer(self) -> None: + """Move unprocessed data to the beginning of the buffer.""" + if self._read_pos > 0: + unprocessed_len = self._write_pos - self._read_pos + if unprocessed_len > 0: + self._buffer[:unprocessed_len] = self._buffer[self._read_pos:self._write_pos] + self._write_pos = unprocessed_len + self._read_pos = 0 + + def get_chunk_for_inference(self, min_duration_sec: float = 0.5) -> Optional[AudioChunkInfo]: + """ + Get the next chunk for ASR inference. + + Returns a chunk of audio when enough data is available. + The chunk includes overlap from the previous chunk for context. 
+ + Args: + min_duration_sec: Minimum duration required to return a chunk + + Returns: + AudioChunkInfo if enough data is available, None otherwise + """ + min_samples = int(min_duration_sec * self.sample_rate) + + with self._lock: + available = self._write_pos - self._read_pos + + if available < min_samples: + return None + + # Calculate chunk boundaries + chunk_start = self._read_pos + chunk_end = min(self._read_pos + self.chunk_size, self._write_pos) + actual_chunk_size = chunk_end - chunk_start + + # Extract audio + audio = self._buffer[chunk_start:chunk_end].copy() + + # Calculate timestamps based on total samples received + base_sample = self._total_samples_received - (self._write_pos - chunk_start) + start_sec = base_sample / self.sample_rate + end_sec = (base_sample + actual_chunk_size) / self.sample_rate + + return AudioChunkInfo( + audio=audio, + start_sample=base_sample, + end_sample=base_sample + actual_chunk_size, + start_sec=start_sec, + end_sec=end_sec, + ) + + def mark_processed(self, samples: int) -> None: + """ + Mark samples as processed, advancing the read position. + + Keeps overlap_size samples for context in the next chunk. + + Args: + samples: Number of samples that were processed + """ + with self._lock: + # Advance read position but keep overlap for context + advance = max(0, samples - self.overlap_size) + self._read_pos = min(self._read_pos + advance, self._write_pos) + + def get_segment(self, start_sec: float, end_sec: float) -> Optional[np.ndarray]: + """ + Get a specific time segment from the buffer. 
+ + Args: + start_sec: Start time in seconds (relative to session start) + end_sec: End time in seconds + + Returns: + Audio segment if available, None otherwise + """ + start_sample = int(start_sec * self.sample_rate) + end_sample = int(end_sec * self.sample_rate) + + with self._lock: + # Calculate buffer positions + buffer_start_sample = self._total_samples_received - self._write_pos + buffer_end_sample = self._total_samples_received + + # Check if segment is in buffer + if start_sample < buffer_start_sample or end_sample > buffer_end_sample: + return None + + # Convert to buffer indices + buf_start = start_sample - buffer_start_sample + buf_end = end_sample - buffer_start_sample + + return self._buffer[buf_start:buf_end].copy() + + def get_all_unprocessed(self) -> Optional[AudioChunkInfo]: + """ + Get all unprocessed audio. + + Returns: + AudioChunkInfo with all unprocessed audio, or None if empty + """ + with self._lock: + if self._write_pos <= self._read_pos: + return None + + audio = self._buffer[self._read_pos:self._write_pos].copy() + base_sample = self._total_samples_received - (self._write_pos - self._read_pos) + start_sec = base_sample / self.sample_rate + end_sec = self._total_samples_received / self.sample_rate + + return AudioChunkInfo( + audio=audio, + start_sample=base_sample, + end_sample=self._total_samples_received, + start_sec=start_sec, + end_sec=end_sec, + ) + + def clear(self) -> None: + """Clear the buffer and reset all positions.""" + with self._lock: + self._buffer.fill(0) + self._write_pos = 0 + self._read_pos = 0 + self._total_samples_received = 0 + + def reset_read_position(self) -> None: + """Reset read position to current write position (skip all unprocessed).""" + with self._lock: + self._read_pos = self._write_pos diff --git a/static/scripts/vibevoice-asr/realtime/models.py b/static/scripts/vibevoice-asr/realtime/models.py new file mode 100644 index 0000000..6846c87 --- /dev/null +++ b/static/scripts/vibevoice-asr/realtime/models.py 
@@ -0,0 +1,154 @@ +""" +Data models for real-time ASR WebSocket communication. +""" + +from enum import Enum +from typing import Optional, List, Dict, Any +from dataclasses import dataclass, field, asdict +import time + + +class MessageType(str, Enum): + """WebSocket message types.""" + # Client -> Server + AUDIO_CHUNK = "audio_chunk" + CONFIG = "config" + START = "start" + STOP = "stop" + + # Server -> Client + PARTIAL_RESULT = "partial_result" + FINAL_RESULT = "final_result" + VAD_EVENT = "vad_event" + ERROR = "error" + STATUS = "status" + + +class VADEventType(str, Enum): + """VAD event types.""" + SPEECH_START = "speech_start" + SPEECH_END = "speech_end" + + +@dataclass +class SessionConfig: + """Configuration for a real-time ASR session.""" + # Audio parameters + sample_rate: int = 16000 + chunk_duration_sec: float = 3.0 + overlap_sec: float = 0.5 + + # VAD parameters + vad_threshold: float = 0.5 + min_speech_duration_ms: int = 250 + min_silence_duration_ms: int = 500 + min_volume_threshold: float = 0.01 # Minimum RMS volume (0.0-1.0) to consider as potential speech + + # ASR parameters + max_new_tokens: int = 512 + temperature: float = 0.0 + context_info: Optional[str] = None + + # Behavior + return_partial_results: bool = True + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SessionConfig": + return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__}) + + +@dataclass +class TranscriptionSegment: + """A single transcription segment with metadata.""" + start_time: float + end_time: float + speaker_id: str + text: str + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +@dataclass +class TranscriptionResult: + """Transcription result message.""" + type: MessageType + text: str + is_final: bool + segments: List[TranscriptionSegment] = field(default_factory=list) + latency_ms: float = 0.0 + timestamp: float = field(default_factory=time.time) + + def 
to_dict(self) -> Dict[str, Any]: + return { + "type": self.type.value, + "text": self.text, + "is_final": self.is_final, + "segments": [s.to_dict() for s in self.segments], + "latency_ms": self.latency_ms, + "timestamp": self.timestamp, + } + + +@dataclass +class VADEvent: + """VAD event message.""" + type: MessageType = MessageType.VAD_EVENT + event: VADEventType = VADEventType.SPEECH_START + timestamp: float = field(default_factory=time.time) + audio_timestamp_sec: float = 0.0 + + def to_dict(self) -> Dict[str, Any]: + return { + "type": self.type.value, + "event": self.event.value, + "timestamp": self.timestamp, + "audio_timestamp_sec": self.audio_timestamp_sec, + } + + +@dataclass +class StatusMessage: + """Status message.""" + type: MessageType = MessageType.STATUS + status: str = "" + message: str = "" + timestamp: float = field(default_factory=time.time) + + def to_dict(self) -> Dict[str, Any]: + return { + "type": self.type.value, + "status": self.status, + "message": self.message, + "timestamp": self.timestamp, + } + + +@dataclass +class ErrorMessage: + """Error message.""" + type: MessageType = MessageType.ERROR + error: str = "" + code: str = "" + timestamp: float = field(default_factory=time.time) + + def to_dict(self) -> Dict[str, Any]: + return { + "type": self.type.value, + "error": self.error, + "code": self.code, + "timestamp": self.timestamp, + } + + +@dataclass +class SpeechSegment: + """Detected speech segment from VAD.""" + start_sample: int + end_sample: int + start_sec: float + end_sec: float + confidence: float = 1.0 diff --git a/static/scripts/vibevoice-asr/realtime/server.py b/static/scripts/vibevoice-asr/realtime/server.py new file mode 100644 index 0000000..615490b --- /dev/null +++ b/static/scripts/vibevoice-asr/realtime/server.py @@ -0,0 +1,300 @@ +""" +FastAPI WebSocket server for real-time ASR. + +Provides WebSocket endpoint for streaming audio and receiving transcriptions. 
+""" + +import os +import sys +import asyncio +import json +import time +from typing import Optional +from contextlib import asynccontextmanager + +from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException +from fastapi.staticfiles import StaticFiles +from fastapi.responses import HTMLResponse, JSONResponse +import uvicorn + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from .models import ( + SessionConfig, + TranscriptionResult, + VADEvent, + StatusMessage, + ErrorMessage, + MessageType, +) +from .asr_worker import ASRWorker +from .session_manager import SessionManager + + +# Global instances +asr_worker: Optional[ASRWorker] = None +session_manager: Optional[SessionManager] = None + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Application lifespan manager.""" + global asr_worker, session_manager + + # Startup + print("Starting VibeVoice Realtime ASR Server...") + + # Get model path from environment or use default + model_path = os.environ.get("VIBEVOICE_MODEL_PATH", "microsoft/VibeVoice-ASR") + device = os.environ.get("VIBEVOICE_DEVICE", "cuda") + attn_impl = os.environ.get("VIBEVOICE_ATTN_IMPL", "flash_attention_2") + + # Initialize ASR worker + asr_worker = ASRWorker( + model_path=model_path, + device=device, + attn_implementation=attn_impl, + ) + + # Pre-load model (optional, can be lazy-loaded on first request) + preload = os.environ.get("VIBEVOICE_PRELOAD_MODEL", "true").lower() == "true" + if preload: + print("Pre-loading ASR model...") + asr_worker.load_model() + + # Initialize session manager + max_sessions = int(os.environ.get("VIBEVOICE_MAX_SESSIONS", "10")) + session_manager = SessionManager( + asr_worker=asr_worker, + max_concurrent_sessions=max_sessions, + ) + await session_manager.start() + + print("Server ready!") + + yield + + # Shutdown + print("Shutting down...") + await session_manager.stop() + + +# Create FastAPI app +app = FastAPI( + 
title="VibeVoice Realtime ASR", + description="Real-time speech recognition using VibeVoice ASR", + version="0.1.0", + lifespan=lifespan, +) + +# Mount static files +static_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "static") +if os.path.exists(static_dir): + app.mount("/static", StaticFiles(directory=static_dir), name="static") + + +@app.get("/") +async def root(): + """Root endpoint with API info.""" + return { + "service": "VibeVoice Realtime ASR", + "version": "0.1.0", + "endpoints": { + "websocket": "/ws/asr/{session_id}", + "health": "/health", + "stats": "/stats", + "client": "/static/realtime_client.html", + }, + } + + +@app.get("/health") +async def health_check(): + """Health check endpoint.""" + return { + "status": "healthy", + "model_loaded": asr_worker.is_loaded if asr_worker else False, + "active_sessions": len(session_manager._sessions) if session_manager else 0, + } + + +@app.get("/stats") +async def get_stats(): + """Get server statistics.""" + if session_manager is None: + raise HTTPException(status_code=503, detail="Server not initialized") + + return session_manager.get_stats() + + +@app.websocket("/ws/asr/{session_id}") +async def websocket_asr(websocket: WebSocket, session_id: str): + """ + WebSocket endpoint for real-time ASR. + + Protocol: + 1. Client connects and optionally sends config message + 2. Client sends binary audio chunks (PCM 16-bit, 16kHz, mono) + 3. 
Server sends JSON messages with transcription results + + Message types (server -> client): + - partial_result: Intermediate transcription + - final_result: Complete transcription for a segment + - vad_event: Speech start/end events + - error: Error messages + - status: Status updates + """ + await websocket.accept() + + # Send connection confirmation + await websocket.send_json( + StatusMessage( + status="connected", + message=f"Session {session_id} connected", + ).to_dict() + ) + + # Result callback + async def on_result(result: TranscriptionResult): + try: + await websocket.send_json(result.to_dict()) + except Exception as e: + print(f"[{session_id}] Failed to send result: {e}") + + # VAD event callback + async def on_vad_event(event: VADEvent): + try: + await websocket.send_json(event.to_dict()) + except Exception as e: + print(f"[{session_id}] Failed to send VAD event: {e}") + + # Create session + session = await session_manager.create_session( + session_id=session_id, + on_result=on_result, + on_vad_event=on_vad_event, + ) + + if session is None: + await websocket.send_json( + ErrorMessage( + error="Maximum sessions reached", + code="MAX_SESSIONS", + ).to_dict() + ) + await websocket.close() + return + + await websocket.send_json( + StatusMessage( + status="ready", + message="Session ready for audio", + ).to_dict() + ) + + try: + while True: + # Receive message + message = await websocket.receive() + + if message["type"] == "websocket.disconnect": + break + + # Handle binary audio data + if "bytes" in message: + audio_data = message["bytes"] + try: + await session.process_audio_chunk(audio_data) + except Exception as e: + print(f"[{session_id}] Error processing audio: {e}") + import traceback + traceback.print_exc() + + # Handle JSON control messages + elif "text" in message: + try: + data = json.loads(message["text"]) + msg_type = data.get("type") + + if msg_type == "config": + # Update session config + config = SessionConfig.from_dict(data.get("config", 
{})) + session.update_config(config) + await websocket.send_json( + StatusMessage( + status="config_updated", + message="Configuration updated", + ).to_dict() + ) + + elif msg_type == "stop": + # Flush and close + await session.flush() + await websocket.send_json( + StatusMessage( + status="stopped", + message="Session stopped", + ).to_dict() + ) + break + + elif msg_type == "ping": + await websocket.send_json({"type": "pong", "timestamp": time.time()}) + + except json.JSONDecodeError: + await websocket.send_json( + ErrorMessage( + error="Invalid JSON", + code="INVALID_JSON", + ).to_dict() + ) + + except WebSocketDisconnect: + print(f"[{session_id}] Client disconnected") + except Exception as e: + print(f"[{session_id}] Error: {e}") + import traceback + traceback.print_exc() + finally: + # Clean up session + await session_manager.close_session(session_id) + print(f"[{session_id}] Session closed") + + +def main(): + """Main entry point.""" + import argparse + + parser = argparse.ArgumentParser(description="VibeVoice Realtime ASR Server") + parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to") + parser.add_argument("--port", type=int, default=8000, help="Port to bind to") + parser.add_argument("--model-path", type=str, default="microsoft/VibeVoice-ASR", + help="Path to VibeVoice ASR model") + parser.add_argument("--device", type=str, default="cuda", help="Device (cuda/cpu)") + parser.add_argument("--max-sessions", type=int, default=10, help="Max concurrent sessions") + parser.add_argument("--no-preload", action="store_true", help="Don't preload model") + + args = parser.parse_args() + + # Set environment variables for lifespan + os.environ["VIBEVOICE_MODEL_PATH"] = args.model_path + os.environ["VIBEVOICE_DEVICE"] = args.device + os.environ["VIBEVOICE_MAX_SESSIONS"] = str(args.max_sessions) + os.environ["VIBEVOICE_PRELOAD_MODEL"] = "false" if args.no_preload else "true" + + print(f"Starting server on {args.host}:{args.port}") + 
print(f"Model: {args.model_path}") + print(f"Device: {args.device}") + print(f"Max sessions: {args.max_sessions}") + + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level="info", + ) + + +if __name__ == "__main__": + main() diff --git a/static/scripts/vibevoice-asr/realtime/session_manager.py b/static/scripts/vibevoice-asr/realtime/session_manager.py new file mode 100644 index 0000000..7aaeaff --- /dev/null +++ b/static/scripts/vibevoice-asr/realtime/session_manager.py @@ -0,0 +1,401 @@ +""" +Session manager for real-time ASR. + +Manages multiple concurrent client sessions with resource isolation. +""" + +import asyncio +import time +import uuid +from typing import Dict, Optional, Callable, Any +from dataclasses import dataclass, field +import threading + +from .models import ( + SessionConfig, + TranscriptionResult, + VADEvent, + StatusMessage, + ErrorMessage, + MessageType, + SpeechSegment, +) +from .audio_buffer import AudioBuffer +from .vad_processor import VADProcessor +from .asr_worker import ASRWorker + + +@dataclass +class SessionStats: + """Statistics for a session.""" + created_at: float = field(default_factory=time.time) + last_activity: float = field(default_factory=time.time) + audio_received_sec: float = 0.0 + chunks_received: int = 0 + segments_transcribed: int = 0 + total_latency_ms: float = 0.0 + + +class RealtimeSession: + """ + A single real-time ASR session. + + Manages audio buffering, VAD, and ASR for one client connection. + """ + + def __init__( + self, + session_id: str, + asr_worker: ASRWorker, + config: Optional[SessionConfig] = None, + on_result: Optional[Callable[[TranscriptionResult], Any]] = None, + on_vad_event: Optional[Callable[[VADEvent], Any]] = None, + ): + """ + Initialize a session. 
+ + Args: + session_id: Unique session identifier + asr_worker: Shared ASR worker instance + config: Session configuration + on_result: Callback for transcription results + on_vad_event: Callback for VAD events + """ + self.session_id = session_id + self.asr_worker = asr_worker + self.config = config or SessionConfig() + self.on_result = on_result + self.on_vad_event = on_vad_event + + # Components + self.audio_buffer = AudioBuffer( + sample_rate=self.config.sample_rate, + chunk_duration_sec=self.config.chunk_duration_sec, + overlap_sec=self.config.overlap_sec, + ) + + self.vad_processor = VADProcessor( + sample_rate=self.config.sample_rate, + threshold=self.config.vad_threshold, + min_speech_duration_ms=self.config.min_speech_duration_ms, + min_silence_duration_ms=self.config.min_silence_duration_ms, + min_volume_threshold=self.config.min_volume_threshold, + ) + + # State + self.is_active = True + self.stats = SessionStats() + self._processing_lock = asyncio.Lock() + self._pending_tasks: list = [] + + async def process_audio_chunk(self, audio_data: bytes) -> None: + """ + Process an incoming audio chunk. 
+ + Args: + audio_data: Raw PCM audio data (16-bit, 16kHz, mono) + """ + if not self.is_active: + return + + self.stats.last_activity = time.time() + self.stats.chunks_received += 1 + + # Convert bytes to float32 array + import numpy as np + audio_int16 = np.frombuffer(audio_data, dtype=np.int16) + audio_float = audio_int16.astype(np.float32) / 32768.0 + + self.stats.audio_received_sec += len(audio_float) / self.config.sample_rate + + # Add to buffer + self.audio_buffer.append(audio_float) + + # Process with VAD + segments, events = self.vad_processor.process(audio_float) + + # Send VAD events + if self.on_vad_event: + for event in events: + await self._send_callback(self.on_vad_event, event) + + # Process completed speech segments + for segment in segments: + try: + await self._transcribe_segment(segment) + except Exception as e: + print(f"[Session {self.session_id}] Transcription error: {e}") + import traceback + traceback.print_exc() + + async def _transcribe_segment(self, segment: SpeechSegment) -> None: + """Transcribe a detected speech segment.""" + # Get audio for segment from buffer + audio = self.audio_buffer.get_segment(segment.start_sec, segment.end_sec) + + if audio is None or len(audio) == 0: + print(f"[Session {self.session_id}] Could not retrieve audio for segment") + return + + self.stats.segments_transcribed += 1 + + async def on_partial(result: TranscriptionResult): + if self.on_result: + await self._send_callback(self.on_result, result) + + # Run transcription + result = await self.asr_worker.transcribe_segment( + audio=audio, + sample_rate=self.config.sample_rate, + context_info=self.config.context_info, + segment_start_sec=segment.start_sec, + segment_end_sec=segment.end_sec, + config=self.config, + on_partial=on_partial if self.config.return_partial_results else None, + ) + + self.stats.total_latency_ms += result.latency_ms + + # Send final result + if self.on_result: + await self._send_callback(self.on_result, result) + + async def 
_send_callback(self, callback: Callable, data: Any) -> None: + """Send data via callback, handling both sync and async.""" + try: + if asyncio.iscoroutinefunction(callback): + await callback(data) + else: + callback(data) + except Exception as e: + print(f"[Session {self.session_id}] Callback error: {e}") + + async def flush(self) -> None: + """ + Flush any remaining audio and force transcription. + + Called when session ends to process any remaining speech. + """ + # Force end any active speech + segment = self.vad_processor.force_end_speech() + if segment: + await self._transcribe_segment(segment) + + # Also check for any unprocessed audio in buffer + chunk_info = self.audio_buffer.get_all_unprocessed() + if chunk_info and len(chunk_info.audio) > self.config.sample_rate * 0.5: + # More than 0.5 seconds of unprocessed audio + forced_segment = SpeechSegment( + start_sample=chunk_info.start_sample, + end_sample=chunk_info.end_sample, + start_sec=chunk_info.start_sec, + end_sec=chunk_info.end_sec, + ) + await self._transcribe_segment(forced_segment) + + def update_config(self, new_config: SessionConfig) -> None: + """Update session configuration (partial update supported).""" + # Merge with existing config - only update non-default values + if new_config.vad_threshold != 0.5: + self.config.vad_threshold = new_config.vad_threshold + if new_config.min_speech_duration_ms != 250: + self.config.min_speech_duration_ms = new_config.min_speech_duration_ms + if new_config.min_silence_duration_ms != 500: + self.config.min_silence_duration_ms = new_config.min_silence_duration_ms + if new_config.min_volume_threshold != 0.01: + self.config.min_volume_threshold = new_config.min_volume_threshold + if new_config.context_info is not None: + self.config.context_info = new_config.context_info + + # Recreate VAD processor with new parameters + self.vad_processor = VADProcessor( + sample_rate=self.config.sample_rate, + threshold=self.config.vad_threshold, + 
min_speech_duration_ms=self.config.min_speech_duration_ms, + min_silence_duration_ms=self.config.min_silence_duration_ms, + min_volume_threshold=self.config.min_volume_threshold, + ) + print(f"[Session {self.session_id}] Config updated: vad_threshold={self.config.vad_threshold}, " + f"min_speech={self.config.min_speech_duration_ms}ms, min_silence={self.config.min_silence_duration_ms}ms, " + f"min_volume={self.config.min_volume_threshold}") + + def close(self) -> None: + """Close the session and release resources.""" + self.is_active = False + self.audio_buffer.clear() + self.vad_processor.reset() + + def get_stats(self) -> Dict: + """Get session statistics.""" + return { + "session_id": self.session_id, + "created_at": self.stats.created_at, + "last_activity": self.stats.last_activity, + "duration_sec": time.time() - self.stats.created_at, + "audio_received_sec": self.stats.audio_received_sec, + "chunks_received": self.stats.chunks_received, + "segments_transcribed": self.stats.segments_transcribed, + "avg_latency_ms": ( + self.stats.total_latency_ms / self.stats.segments_transcribed + if self.stats.segments_transcribed > 0 else 0 + ), + "is_active": self.is_active, + "vad_speech_active": self.vad_processor.is_speech_active, + } + + +class SessionManager: + """ + Manages multiple concurrent ASR sessions. + + Features: + - Session creation and cleanup + - Resource limiting (max concurrent sessions) + - Idle session timeout + - Shared ASR worker management + """ + + def __init__( + self, + asr_worker: ASRWorker, + max_concurrent_sessions: int = 10, + session_timeout_sec: float = 300.0, + ): + """ + Initialize the session manager. 
+ + Args: + asr_worker: Shared ASR worker + max_concurrent_sessions: Maximum number of concurrent sessions + session_timeout_sec: Timeout for idle sessions + """ + self.asr_worker = asr_worker + self.max_sessions = max_concurrent_sessions + self.session_timeout = session_timeout_sec + + self._sessions: Dict[str, RealtimeSession] = {} + self._lock = asyncio.Lock() + + # Cleanup task + self._cleanup_task: Optional[asyncio.Task] = None + + async def start(self) -> None: + """Start the session manager.""" + self._cleanup_task = asyncio.create_task(self._cleanup_loop()) + + async def stop(self) -> None: + """Stop the session manager and close all sessions.""" + if self._cleanup_task: + self._cleanup_task.cancel() + try: + await self._cleanup_task + except asyncio.CancelledError: + pass + + async with self._lock: + for session in self._sessions.values(): + session.close() + self._sessions.clear() + + async def create_session( + self, + session_id: Optional[str] = None, + config: Optional[SessionConfig] = None, + on_result: Optional[Callable[[TranscriptionResult], Any]] = None, + on_vad_event: Optional[Callable[[VADEvent], Any]] = None, + ) -> Optional[RealtimeSession]: + """ + Create a new session. 
+ + Args: + session_id: Optional session ID (generated if not provided) + config: Session configuration + on_result: Callback for results + on_vad_event: Callback for VAD events + + Returns: + Created session, or None if limit reached + """ + async with self._lock: + # Check session limit + if len(self._sessions) >= self.max_sessions: + return None + + # Generate session ID if not provided + if session_id is None: + session_id = str(uuid.uuid4())[:8] + + # Check for duplicate + if session_id in self._sessions: + return self._sessions[session_id] + + # Create session + session = RealtimeSession( + session_id=session_id, + asr_worker=self.asr_worker, + config=config, + on_result=on_result, + on_vad_event=on_vad_event, + ) + + self._sessions[session_id] = session + return session + + async def get_session(self, session_id: str) -> Optional[RealtimeSession]: + """Get a session by ID.""" + async with self._lock: + return self._sessions.get(session_id) + + async def close_session(self, session_id: str) -> bool: + """ + Close and remove a session. 
+ + Args: + session_id: Session to close + + Returns: + True if session was found and closed + """ + async with self._lock: + session = self._sessions.pop(session_id, None) + if session: + await session.flush() + session.close() + return True + return False + + async def _cleanup_loop(self) -> None: + """Background task to clean up idle sessions.""" + while True: + try: + await asyncio.sleep(60) # Check every minute + + current_time = time.time() + sessions_to_close = [] + + async with self._lock: + for session_id, session in self._sessions.items(): + idle_time = current_time - session.stats.last_activity + if idle_time > self.session_timeout: + sessions_to_close.append(session_id) + + for session_id in sessions_to_close: + print(f"Closing idle session: {session_id}") + await self.close_session(session_id) + + except asyncio.CancelledError: + break + except Exception as e: + print(f"Cleanup error: {e}") + + def get_stats(self) -> Dict: + """Get manager statistics.""" + return { + "active_sessions": len(self._sessions), + "max_sessions": self.max_sessions, + "session_timeout_sec": self.session_timeout, + "sessions": { + sid: session.get_stats() + for sid, session in self._sessions.items() + }, + } diff --git a/static/scripts/vibevoice-asr/realtime/vad_processor.py b/static/scripts/vibevoice-asr/realtime/vad_processor.py new file mode 100644 index 0000000..62fbcf2 --- /dev/null +++ b/static/scripts/vibevoice-asr/realtime/vad_processor.py @@ -0,0 +1,295 @@ +""" +Voice Activity Detection (VAD) processor using Silero-VAD (ONNX version). + +Detects speech segments in real-time audio streams. +Uses ONNX runtime to avoid torchaudio dependency issues. 
+""" + +import numpy as np +from typing import List, Optional, Tuple +from dataclasses import dataclass +import threading +import os +import urllib.request + +from .models import SpeechSegment, VADEvent, VADEventType, MessageType + + +@dataclass +class VADState: + """Internal state of the VAD processor.""" + is_speech_active: bool = False + speech_start_sample: int = 0 + silence_start_sample: int = 0 + last_speech_prob: float = 0.0 + total_samples_processed: int = 0 + + +class VADProcessor: + """ + Voice Activity Detection using Silero-VAD (ONNX version). + + Features: + - Real-time speech detection + - Configurable thresholds for speech/silence duration + - Event generation for speech start/end + - Thread-safe operations + - No torchaudio dependency (uses ONNX runtime) + """ + + # Silero VAD ONNX model URL + ONNX_MODEL_URL = "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx" + + def __init__( + self, + sample_rate: int = 16000, + threshold: float = 0.5, + min_speech_duration_ms: int = 250, + min_silence_duration_ms: int = 500, + window_size_samples: int = 512, + min_volume_threshold: float = 0.01, + ): + """ + Initialize the VAD processor. 
+ + Args: + sample_rate: Audio sample rate (must be 16000 for Silero-VAD) + threshold: Speech probability threshold (0.0-1.0) + min_speech_duration_ms: Minimum speech duration to trigger speech_start + min_silence_duration_ms: Minimum silence duration to trigger speech_end + window_size_samples: VAD window size (512 for 16kHz = 32ms) + min_volume_threshold: Minimum RMS volume (0.0-1.0) to consider as potential speech + """ + if sample_rate != 16000: + raise ValueError("Silero-VAD requires 16kHz sample rate") + + self.sample_rate = sample_rate + self.threshold = threshold + self.min_speech_samples = int(min_speech_duration_ms * sample_rate / 1000) + self.min_silence_samples = int(min_silence_duration_ms * sample_rate / 1000) + self.window_size = window_size_samples + self.min_volume_threshold = min_volume_threshold + + # Load ONNX model + self._session = None + self._load_model() + + # ONNX model state - single state tensor (size depends on model version) + # Silero VAD v5 uses a single 'state' tensor of shape (2, 1, 128) + self._state_tensor = np.zeros((2, 1, 128), dtype=np.float32) + + # State + self._state = VADState() + self._lock = threading.Lock() + + # Pending speech segment (being accumulated) + self._pending_segment_start: Optional[int] = None + + def _get_model_path(self) -> str: + """Get path to ONNX model, downloading if necessary.""" + cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "silero-vad") + os.makedirs(cache_dir, exist_ok=True) + model_path = os.path.join(cache_dir, "silero_vad.onnx") + + if not os.path.exists(model_path): + print(f"Downloading Silero-VAD ONNX model to {model_path}...") + urllib.request.urlretrieve(self.ONNX_MODEL_URL, model_path) + print("Download complete.") + + return model_path + + def _load_model(self) -> None: + """Load Silero-VAD ONNX model.""" + try: + import onnxruntime as ort + + model_path = self._get_model_path() + self._session = ort.InferenceSession( + model_path, + providers=['CPUExecutionProvider'] + 
) + print(f"Silero-VAD ONNX model loaded from {model_path}") + + except Exception as e: + raise RuntimeError(f"Failed to load Silero-VAD ONNX model: {e}") + + def _run_inference(self, audio_window: np.ndarray) -> float: + """Run VAD inference on a single window.""" + # Prepare input + audio_input = audio_window.reshape(1, -1).astype(np.float32) + sr_input = np.array([self.sample_rate], dtype=np.int64) + + # Run inference - Silero VAD v5 uses 'state' instead of 'h'/'c' + outputs = self._session.run( + ['output', 'stateN'], + { + 'input': audio_input, + 'sr': sr_input, + 'state': self._state_tensor, + } + ) + + # Update state + speech_prob = outputs[0][0][0] + self._state_tensor = outputs[1] + + return float(speech_prob) + + def reset(self) -> None: + """Reset VAD state for a new session.""" + with self._lock: + self._state = VADState() + self._pending_segment_start = None + # Reset state tensor + self._state_tensor = np.zeros((2, 1, 128), dtype=np.float32) + + def process( + self, + audio_chunk: np.ndarray, + return_events: bool = True, + ) -> Tuple[List[SpeechSegment], List[VADEvent]]: + """ + Process an audio chunk and detect speech segments. 
+ + Args: + audio_chunk: Audio data as float32 array + return_events: Whether to return VAD events + + Returns: + Tuple of (completed_segments, events) + """ + if audio_chunk.dtype != np.float32: + audio_chunk = audio_chunk.astype(np.float32) + + completed_segments: List[SpeechSegment] = [] + events: List[VADEvent] = [] + + with self._lock: + # Process in windows + chunk_start_sample = self._state.total_samples_processed + num_windows = len(audio_chunk) // self.window_size + + for i in range(num_windows): + window_start = i * self.window_size + window_end = window_start + self.window_size + window = audio_chunk[window_start:window_end] + + # Check volume (RMS) threshold first + rms = np.sqrt(np.mean(window ** 2)) + if rms < self.min_volume_threshold: + # Volume too low, treat as silence + speech_prob = 0.0 + else: + # Get speech probability from VAD model + speech_prob = self._run_inference(window) + + self._state.last_speech_prob = speech_prob + + current_sample = chunk_start_sample + window_end + is_speech = speech_prob >= self.threshold + + # State machine for speech detection + if is_speech: + if not self._state.is_speech_active: + # Potential speech start + if self._pending_segment_start is None: + self._pending_segment_start = current_sample - self.window_size + + # Check if speech duration exceeds minimum + speech_duration = current_sample - self._pending_segment_start + if speech_duration >= self.min_speech_samples: + self._state.is_speech_active = True + self._state.speech_start_sample = self._pending_segment_start + + if return_events: + events.append(VADEvent( + type=MessageType.VAD_EVENT, + event=VADEventType.SPEECH_START, + audio_timestamp_sec=self._pending_segment_start / self.sample_rate, + )) + else: + # Continue speech, reset silence counter + self._state.silence_start_sample = 0 + else: + if self._state.is_speech_active: + # Potential speech end + if self._state.silence_start_sample == 0: + self._state.silence_start_sample = current_sample + + # 
Check if silence duration exceeds minimum + silence_duration = current_sample - self._state.silence_start_sample + if silence_duration >= self.min_silence_samples: + # Speech ended - create completed segment + segment = SpeechSegment( + start_sample=self._state.speech_start_sample, + end_sample=self._state.silence_start_sample, + start_sec=self._state.speech_start_sample / self.sample_rate, + end_sec=self._state.silence_start_sample / self.sample_rate, + ) + completed_segments.append(segment) + + if return_events: + events.append(VADEvent( + type=MessageType.VAD_EVENT, + event=VADEventType.SPEECH_END, + audio_timestamp_sec=self._state.silence_start_sample / self.sample_rate, + )) + + # Reset state + self._state.is_speech_active = False + self._state.speech_start_sample = 0 + self._state.silence_start_sample = 0 + self._pending_segment_start = None + else: + # No speech, reset pending + self._pending_segment_start = None + + # Update total samples processed + self._state.total_samples_processed += len(audio_chunk) + + return completed_segments, events + + def force_end_speech(self) -> Optional[SpeechSegment]: + """ + Force end of current speech segment (e.g., when session ends). 
+ + Returns: + Completed speech segment if speech was active, None otherwise + """ + with self._lock: + if self._state.is_speech_active: + segment = SpeechSegment( + start_sample=self._state.speech_start_sample, + end_sample=self._state.total_samples_processed, + start_sec=self._state.speech_start_sample / self.sample_rate, + end_sec=self._state.total_samples_processed / self.sample_rate, + ) + + self._state.is_speech_active = False + self._state.speech_start_sample = 0 + self._state.silence_start_sample = 0 + self._pending_segment_start = None + + return segment + + return None + + @property + def is_speech_active(self) -> bool: + """Check if speech is currently active.""" + with self._lock: + return self._state.is_speech_active + + @property + def last_speech_probability(self) -> float: + """Get the last computed speech probability.""" + with self._lock: + return self._state.last_speech_prob + + @property + def current_speech_duration_sec(self) -> float: + """Get duration of current speech segment (if active).""" + with self._lock: + if not self._state.is_speech_active: + return 0.0 + return (self._state.total_samples_processed - self._state.speech_start_sample) / self.sample_rate diff --git a/static/scripts/vibevoice-asr/requirements-realtime.txt b/static/scripts/vibevoice-asr/requirements-realtime.txt new file mode 100644 index 0000000..c75b3e3 --- /dev/null +++ b/static/scripts/vibevoice-asr/requirements-realtime.txt @@ -0,0 +1,7 @@ +# Real-time ASR dependencies +fastapi>=0.100.0 +uvicorn[standard]>=0.23.0 +websockets>=11.0 +numpy>=1.24.0 +soundfile>=0.12.0 +onnxruntime diff --git a/static/scripts/vibevoice-asr/run_all.sh b/static/scripts/vibevoice-asr/run_all.sh new file mode 100755 index 0000000..0c53f01 --- /dev/null +++ b/static/scripts/vibevoice-asr/run_all.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Run both Gradio demo and Realtime ASR server +# +# Usage: +# ./run_all.sh +# +# Ports: +# - 7860: Gradio UI (batch ASR) +# - 8000: WebSocket API (realtime ASR) + 
+set -e + +cd "$(dirname "$0")" + +# Configuration +GRADIO_HOST="${GRADIO_HOST:-0.0.0.0}" +GRADIO_PORT="${GRADIO_PORT:-7860}" +REALTIME_HOST="${REALTIME_HOST:-0.0.0.0}" +REALTIME_PORT="${REALTIME_PORT:-8000}" +MODEL_PATH="${VIBEVOICE_MODEL_PATH:-microsoft/VibeVoice-ASR}" + +echo "==========================================" +echo "VibeVoice ASR - All Services" +echo "==========================================" +echo "" +echo "Starting services:" +echo " - Gradio UI: http://$GRADIO_HOST:$GRADIO_PORT" +echo " - Realtime ASR: http://$REALTIME_HOST:$REALTIME_PORT" +echo " - Test Client: http://$REALTIME_HOST:$REALTIME_PORT/static/realtime_client.html" +echo "" +echo "Model: $MODEL_PATH" +echo "==========================================" +echo "" + +# Trap to clean up background processes on exit +cleanup() { + echo "" + echo "Shutting down..." + kill $REALTIME_PID 2>/dev/null || true + kill $GRADIO_PID 2>/dev/null || true + wait + echo "All services stopped." +} +trap cleanup EXIT INT TERM + +# Start Realtime ASR server in background +echo "[1/2] Starting Realtime ASR server..." +python -m realtime.server \ + --host "$REALTIME_HOST" \ + --port "$REALTIME_PORT" \ + --model-path "$MODEL_PATH" \ + --no-preload & +REALTIME_PID=$! + +# Wait a moment for the server to initialize +sleep 2 + +# Start Gradio demo in background +echo "[2/2] Starting Gradio demo..." +python demo/vibevoice_asr_gradio_demo.py \ + --host "$GRADIO_HOST" \ + --port "$GRADIO_PORT" \ + --model_path "$MODEL_PATH" & +GRADIO_PID=$! + +echo "" +echo "Both services started. Press Ctrl+C to stop." 
+echo "" + +# Wait for either process to exit +wait -n $REALTIME_PID $GRADIO_PID + +# If one exits, the trap will clean up the other diff --git a/static/scripts/vibevoice-asr/run_realtime.sh b/static/scripts/vibevoice-asr/run_realtime.sh new file mode 100755 index 0000000..d8b7f89 --- /dev/null +++ b/static/scripts/vibevoice-asr/run_realtime.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Run VibeVoice Realtime ASR Server +# +# Usage: +# ./run_realtime.sh [options] +# +# Options are passed to the server (see --help for details) + +set -e + +cd "$(dirname "$0")" + +# Default options +HOST="${VIBEVOICE_HOST:-0.0.0.0}" +PORT="${VIBEVOICE_PORT:-8000}" +MODEL_PATH="${VIBEVOICE_MODEL_PATH:-microsoft/VibeVoice-ASR}" +DEVICE="${VIBEVOICE_DEVICE:-cuda}" +MAX_SESSIONS="${VIBEVOICE_MAX_SESSIONS:-10}" + +echo "==========================================" +echo "VibeVoice Realtime ASR Server" +echo "==========================================" +echo "Host: $HOST" +echo "Port: $PORT" +echo "Model: $MODEL_PATH" +echo "Device: $DEVICE" +echo "Max Sessions: $MAX_SESSIONS" +echo "==========================================" +echo "" +echo "Web client: http://$HOST:$PORT/static/realtime_client.html" +echo "WebSocket: ws://$HOST:$PORT/ws/asr/{session_id}" +echo "" + +# Run server +python -m realtime.server \ + --host "$HOST" \ + --port "$PORT" \ + --model-path "$MODEL_PATH" \ + --device "$DEVICE" \ + --max-sessions "$MAX_SESSIONS" \ + "$@" diff --git a/static/scripts/vibevoice-asr/setup.sh b/static/scripts/vibevoice-asr/setup.sh new file mode 100644 index 0000000..df2a25e --- /dev/null +++ b/static/scripts/vibevoice-asr/setup.sh @@ -0,0 +1,287 @@ +#!/bin/bash +# VibeVoice-ASR Setup Script for DGX Spark +# Downloads and builds the VibeVoice-ASR container +# +# Usage: +# curl -sL https://docs.techswan.online/scripts/vibevoice-asr/setup.sh | bash +# curl -sL https://docs.techswan.online/scripts/vibevoice-asr/setup.sh | bash -s build +# curl -sL 
https://docs.techswan.online/scripts/vibevoice-asr/setup.sh | bash -s serve + +set -e + +BASE_URL="https://docs.techswan.online/scripts/vibevoice-asr" +INSTALL_DIR="${VIBEVOICE_DIR:-$HOME/vibevoice-asr}" +IMAGE_NAME="vibevoice-asr:dgx-spark" +CONTAINER_NAME="vibevoice-asr" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +log_step() { + echo -e "${BLUE}[STEP]${NC} $1" +} + +download_file() { + local url="$1" + local dest="$2" + local dir=$(dirname "$dest") + + mkdir -p "$dir" + + if command -v curl &> /dev/null; then + curl -sL "$url" -o "$dest" + elif command -v wget &> /dev/null; then + wget -q "$url" -O "$dest" + else + log_error "curl or wget is required" + exit 1 + fi +} + +download_files() { + log_step "Downloading VibeVoice-ASR files to $INSTALL_DIR..." + + mkdir -p "$INSTALL_DIR" + cd "$INSTALL_DIR" + + # Core files + local files=( + "Dockerfile" + "requirements-realtime.txt" + "test_vibevoice.py" + "vibevoice_asr_gradio_demo_patched.py" + "run_realtime.sh" + "run_all.sh" + ) + + for file in "${files[@]}"; do + log_info "Downloading $file..." + download_file "$BASE_URL/$file" "$INSTALL_DIR/$file" + done + + # Realtime module + local realtime_files=( + "__init__.py" + "models.py" + "server.py" + "asr_worker.py" + "session_manager.py" + "audio_buffer.py" + "vad_processor.py" + ) + + mkdir -p "$INSTALL_DIR/realtime" + for file in "${realtime_files[@]}"; do + log_info "Downloading realtime/$file..." + download_file "$BASE_URL/realtime/$file" "$INSTALL_DIR/realtime/$file" + done + + # Static files + mkdir -p "$INSTALL_DIR/static" + log_info "Downloading static/realtime_client.html..." 
+ download_file "$BASE_URL/static/realtime_client.html" "$INSTALL_DIR/static/realtime_client.html" + + # Make scripts executable + chmod +x "$INSTALL_DIR/run_realtime.sh" "$INSTALL_DIR/run_all.sh" + + log_info "All files downloaded to $INSTALL_DIR" +} + +check_prerequisites() { + log_step "Checking prerequisites..." + + # Check Docker + if ! command -v docker &> /dev/null; then + log_error "Docker is not installed" + exit 1 + fi + + # Check NVIDIA Docker runtime + if ! docker info 2>/dev/null | grep -q "Runtimes.*nvidia"; then + log_warn "NVIDIA Docker runtime may not be configured" + fi + + # Check GPU availability + if command -v nvidia-smi &> /dev/null; then + log_info "GPU detected:" + nvidia-smi --query-gpu=name,memory.total --format=csv,noheader + else + log_warn "nvidia-smi not found on host" + fi + + log_info "Prerequisites check complete" +} + +build_image() { + log_step "Building Docker image: ${IMAGE_NAME}" + log_info "This may take several minutes..." + + cd "$INSTALL_DIR" + + docker build \ + --network=host \ + -t "$IMAGE_NAME" \ + -f Dockerfile \ + . + + log_info "Docker image built successfully: ${IMAGE_NAME}" +} + +run_container() { + local mode="${1:-interactive}" + + log_step "Running container in ${mode} mode..." + + # Stop existing container if running + if docker ps -q -f name="$CONTAINER_NAME" | grep -q .; then + log_warn "Stopping existing container..." 
+ docker stop "$CONTAINER_NAME" 2>/dev/null || true + fi + + # Remove existing container + docker rm "$CONTAINER_NAME" 2>/dev/null || true + + # Common Docker options for DGX Spark + local docker_opts=( + --gpus all + --ipc=host + --network=host + --ulimit memlock=-1:-1 + --ulimit stack=-1:-1 + -e "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True" + -v "$HOME/.cache/huggingface:/root/.cache/huggingface" + --name "$CONTAINER_NAME" + ) + + if [ "$mode" = "interactive" ]; then + docker run --rm -it "${docker_opts[@]}" "$IMAGE_NAME" bash + elif [ "$mode" = "test" ]; then + docker run --rm "${docker_opts[@]}" "$IMAGE_NAME" python /workspace/test_vibevoice.py + elif [ "$mode" = "demo" ]; then + log_info "Starting Gradio demo on port 7860..." + log_info "Access the demo at: http://localhost:7860" + docker run --rm -it "${docker_opts[@]}" "$IMAGE_NAME" + elif [ "$mode" = "realtime" ]; then + log_info "Starting Realtime ASR server on port 8000..." + log_info "WebSocket API: ws://localhost:8000/ws/asr/{session_id}" + log_info "Test client: http://localhost:8000/static/realtime_client.html" + docker run --rm -it "${docker_opts[@]}" "$IMAGE_NAME" \ + python -m realtime.server --host 0.0.0.0 --port 8000 + elif [ "$mode" = "serve" ]; then + log_info "Starting all services..." 
+ log_info " Gradio demo: http://localhost:7860" + log_info " Realtime ASR: http://localhost:8000" + log_info " Test client: http://localhost:8000/static/realtime_client.html" + docker run --rm -it "${docker_opts[@]}" "$IMAGE_NAME" ./run_all.sh + else + log_error "Unknown mode: $mode" + exit 1 + fi +} + +show_usage() { + echo "VibeVoice-ASR Setup for DGX Spark" + echo "" + echo "Usage:" + echo " curl -sL $BASE_URL/setup.sh | bash # Download only" + echo " curl -sL $BASE_URL/setup.sh | bash -s build # Download and build" + echo " curl -sL $BASE_URL/setup.sh | bash -s demo # Download, build, run demo" + echo " curl -sL $BASE_URL/setup.sh | bash -s serve # Download, build, run all" + echo "" + echo "Commands:" + echo " (default) Download files only" + echo " build Download and build Docker image" + echo " demo Download, build, and start Gradio demo (port 7860)" + echo " realtime Download, build, and start Realtime ASR (port 8000)" + echo " serve Download, build, and start both services" + echo " run Run container interactively (after build)" + echo "" + echo "Environment variables:" + echo " VIBEVOICE_DIR Installation directory (default: ~/vibevoice-asr)" + echo "" + echo "After installation, you can also run:" + echo " cd ~/vibevoice-asr" + echo " docker run --gpus all -p 7860:7860 vibevoice-asr:dgx-spark" +} + +main() { + local command="${1:-download}" + + echo "" + echo "==========================================" + echo " VibeVoice-ASR Setup for DGX Spark" + echo "==========================================" + echo "" + + case "$command" in + download) + download_files + echo "" + log_info "Done! Next steps:" + echo " cd $INSTALL_DIR" + echo " docker build -t vibevoice-asr:dgx-spark ." + echo " docker run --gpus all -p 7860:7860 vibevoice-asr:dgx-spark" + ;; + build) + download_files + check_prerequisites + build_image + echo "" + log_info "Done! 
To run:" + echo " cd $INSTALL_DIR" + echo " docker run --gpus all -p 7860:7860 vibevoice-asr:dgx-spark" + ;; + run) + cd "$INSTALL_DIR" 2>/dev/null || { log_error "Run 'build' first"; exit 1; } + run_container interactive + ;; + test) + cd "$INSTALL_DIR" 2>/dev/null || { log_error "Run 'build' first"; exit 1; } + run_container test + ;; + demo) + download_files + check_prerequisites + build_image + run_container demo + ;; + realtime) + download_files + check_prerequisites + build_image + run_container realtime + ;; + serve) + download_files + check_prerequisites + build_image + run_container serve + ;; + -h|--help|help) + show_usage + ;; + *) + log_error "Unknown command: $command" + show_usage + exit 1 + ;; + esac +} + +main "$@" diff --git a/static/scripts/vibevoice-asr/static/realtime_client.html b/static/scripts/vibevoice-asr/static/realtime_client.html new file mode 100644 index 0000000..4ff576c --- /dev/null +++ b/static/scripts/vibevoice-asr/static/realtime_client.html @@ -0,0 +1,899 @@ + + +
+ + +リアルタイム音声認識デモ
++ 接続して録音を開始するとここに認識結果が表示されます +
+🎵 Click the play button to listen to each segment directly!
" + + for i, (label, audio_src, error_msg) in enumerate(audio_segments): + seg = segments[i] if i < len(segments) else {} + start_time = seg.get('start_time', 'N/A') + end_time = seg.get('end_time', 'N/A') + speaker_id = seg.get('speaker_id', 'N/A') + content = seg.get('text', '') + + # Format times nicely + start_str = f"{start_time:.2f}" if isinstance(start_time, (int, float)) else str(start_time) + end_str = f"{end_time:.2f}" if isinstance(end_time, (int, float)) else str(end_time) + + audio_segments_html += f""" +โ No audio segments available.
+This could happen if the model output doesn't contain valid time stamps.
+