From 4cc14bc0a968531fa09089371f0a8e3cfa000962 Mon Sep 17 00:00:00 2001 From: Scott Register Date: Sun, 27 Jul 2025 08:34:57 -0700 Subject: [PATCH] nvenc --- config-streaming-runpod.yaml | 8 ++--- runpod_setup.sh | 3 +- vr180_streaming/frame_writer.py | 58 ++++++++++++++++++++++++++++--- vr180_streaming/sam2_streaming.py | 5 +-- 4 files changed, 62 insertions(+), 12 deletions(-) diff --git a/config-streaming-runpod.yaml b/config-streaming-runpod.yaml index aead26d..768ed4d 100644 --- a/config-streaming-runpod.yaml +++ b/config-streaming-runpod.yaml @@ -27,7 +27,7 @@ matting: sam2_model_cfg: "sam2.1_hiera_l" # Use large model for best quality sam2_checkpoint: "segment-anything-2/checkpoints/sam2.1_hiera_large.pt" memory_offload: true # Critical for streaming - offload to CPU when needed - fp16: true # Use half precision for memory efficiency + fp16: false # Disable FP16 to avoid type mismatch with compiled models continuous_correction: true # Periodically refine tracking correction_interval: 300 # Correct every 5 seconds at 60fps @@ -43,14 +43,14 @@ output: path: "/workspace/output_video.mp4" # Update with your output path format: "greenscreen" # "greenscreen" or "alpha" background_color: [0, 255, 0] # RGB for green screen - video_codec: "libx264" # CPU encoding (use "h264_nvenc" if GPU encoding works) - quality_preset: "medium" # CPU preset (ultrafast/fast/medium/slow/veryslow) + video_codec: "h264_nvenc" # GPU encoding for L40 (fallback to CPU if not available) + quality_preset: "p4" # NVENC preset (p1=fastest, p7=slowest/best quality) crf: 18 # Quality (0-51, lower = better, 18 = high quality) maintain_sbs: true # Keep side-by-side format with audio hardware: device: "cuda" - max_vram_gb: 40.0 # Conservative limit for 48GB GPU + max_vram_gb: 44.0 # Conservative limit for L40 48GB VRAM max_ram_gb: 48.0 # RunPod container RAM limit recovery: diff --git a/runpod_setup.sh b/runpod_setup.sh index d4e757b..34b635d 100755 --- a/runpod_setup.sh +++ 
b/runpod_setup.sh @@ -1,6 +1,7 @@ #!/bin/bash # VR180 Matting Unified Setup Script for RunPod # Supports both chunked and streaming implementations +# Optimized for L40, A6000, and other NVENC-capable GPUs set -e # Exit on error @@ -235,7 +236,7 @@ echo "===================" echo "- Streaming: Best for long videos, uses ~50GB RAM constant" echo "- Chunked: More stable but uses 100GB+ RAM in spikes" echo "- Scale factor: 0.25 (fast) → 0.5 (balanced) → 1.0 (quality)" -echo "- A6000/A100: Can handle 0.5-0.75 scale easily" +echo "- L40/A6000: Can handle 0.5-0.75 scale easily with NVENC GPU encoding" echo "- Monitor VRAM with: nvidia-smi -l 1" echo echo "🎯 Example Commands:" diff --git a/vr180_streaming/frame_writer.py b/vr180_streaming/frame_writer.py index f32d3de..f33db1f 100644 --- a/vr180_streaming/frame_writer.py +++ b/vr180_streaming/frame_writer.py @@ -11,6 +11,28 @@ import atexit import warnings +def test_nvenc_support() -> bool: + """Test if NVENC encoding is available""" + try: + # Quick test with a 1-frame video + cmd = [ + 'ffmpeg', '-f', 'lavfi', '-i', 'testsrc=duration=0.1:size=320x240:rate=1', + '-c:v', 'h264_nvenc', '-t', '0.1', '-f', 'null', '-' + ] + + result = subprocess.run( + cmd, + capture_output=True, + timeout=10, + text=True + ) + + return result.returncode == 0 + + except (subprocess.TimeoutExpired, FileNotFoundError): + return False + + class StreamingFrameWriter: """Write frames directly to ffmpeg via pipe for memory-efficient output""" @@ -36,6 +58,16 @@ class StreamingFrameWriter: self.frames_written = 0 self.ffmpeg_process = None + # Test NVENC support if GPU codec requested + if video_codec in ['h264_nvenc', 'hevc_nvenc']: + print(f"🔍 Testing NVENC support...") + if not test_nvenc_support(): + print(f"❌ NVENC not available, switching to CPU encoding") + video_codec = 'libx264' + quality_preset = 'medium' + else: + print(f"✅ NVENC available") + # Build ffmpeg command self.ffmpeg_cmd = self._build_ffmpeg_command( video_codec, 
quality_preset, crf @@ -134,23 +166,39 @@ class StreamingFrameWriter: # Test if ffmpeg starts successfully (quick check) import time - time.sleep(0.1) # Give ffmpeg time to fail if it's going to + time.sleep(0.2) # Give ffmpeg time to fail if it's going to if self.ffmpeg_process.poll() is not None: # Process already died - read error stderr = self.ffmpeg_process.stderr.read().decode() - raise RuntimeError(f"FFmpeg failed immediately: {stderr}") + + # Check for specific NVENC errors and provide better feedback + if 'nvenc' in ' '.join(self.ffmpeg_cmd): + if 'unsupported device' in stderr.lower(): + print(f"❌ NVENC not available on this GPU - switching to CPU encoding") + elif 'cannot load' in stderr.lower() or 'not found' in stderr.lower(): + print(f"❌ NVENC drivers not available - switching to CPU encoding") + else: + print(f"❌ NVENC encoding failed: {stderr}") + + # Try CPU fallback + print(f"🔄 Falling back to CPU encoding (libx264)...") + self.ffmpeg_cmd = self._build_ffmpeg_command('libx264', 'medium', 18) + return self._start_ffmpeg() + else: + raise RuntimeError(f"FFmpeg failed: {stderr}") # Set process to ignore SIGINT (Ctrl+C) - we'll handle it if hasattr(signal, 'pthread_sigmask'): signal.pthread_sigmask(signal.SIG_BLOCK, [signal.SIGINT]) except Exception as e: - # Try CPU fallback if GPU encoding fails + # Final fallback if everything fails if 'nvenc' in ' '.join(self.ffmpeg_cmd): - print(f"⚠️ GPU encoding failed, trying CPU fallback...") + print(f"⚠️ GPU encoding failed with error: {e}") + print(f"🔄 Falling back to CPU encoding...") self.ffmpeg_cmd = self._build_ffmpeg_command('libx264', 'medium', 18) - self._start_ffmpeg() + return self._start_ffmpeg() else: raise RuntimeError(f"Failed to start ffmpeg: {e}") diff --git a/vr180_streaming/sam2_streaming.py b/vr180_streaming/sam2_streaming.py index 28d907b..2216c5f 100644 --- a/vr180_streaming/sam2_streaming.py +++ b/vr180_streaming/sam2_streaming.py @@ -83,9 +83,10 @@ class SAM2StreamingProcessor: # Set to 
eval mode self.predictor.eval() - # Enable FP16 if requested + # Note: FP16 conversion can cause type mismatches with compiled models + # Let SAM2 handle precision internally via build_sam2_video_predictor options if self.fp16 and self.device.type == 'cuda': - self.predictor = self.predictor.half() + print(" FP16 enabled via SAM2 internal settings") except Exception as e: raise RuntimeError(f"Failed to initialize SAM2 predictor: {e}")