From 725a7814560dcf88b1d03f421ca8fe348f522baa Mon Sep 17 00:00:00 2001
From: Scott Register <sreg@sreg.io>
Date: Sat, 26 Jul 2025 12:29:32 -0700
Subject: [PATCH] Add optional CuPy GPU acceleration for VR180 stereo processing

---
 requirements.txt                 |   5 +-
 runpod_setup.sh                  |  22 +++++
 vr180_matting/vr180_processor.py | 159 +++++++++++++++++++++++++++----
 3 files changed, 165 insertions(+), 21 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5ef24af..70f6064 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,7 @@ ultralytics>=8.0.0
 tqdm>=4.65.0
 psutil>=5.9.0
 ffmpeg-python>=0.2.0
-decord>=0.6.0
\ No newline at end of file
+decord>=0.6.0
+# GPU acceleration (optional but recommended for stereo validation speedup): uncomment the line matching your CUDA version
+# cupy-cuda11x>=12.0.0  # For CUDA 11.x
+# cupy-cuda12x>=12.0.0  # For CUDA 12.x
\ No newline at end of file
diff --git a/runpod_setup.sh b/runpod_setup.sh
index 5aa992c..046ddc0 100755
--- a/runpod_setup.sh
+++ b/runpod_setup.sh
@@ -18,6 +18,28 @@ pip install -r requirements.txt
 echo "📹 Installing decord for video processing..."
 pip install decord
 
+# Install CuPy for GPU acceleration of stereo validation
+echo "🚀 Installing CuPy for GPU acceleration..."
+# Auto-detect CUDA version and install appropriate CuPy
+python -c "
+import subprocess
+import sys
+import torch
+if torch.cuda.is_available():
+    cuda_version = torch.version.cuda or ''  # None on non-CUDA (e.g. ROCm) builds of torch
+    print(f'CUDA version detected: {cuda_version}')
+    major = cuda_version.split('.')[0]
+    if major in ('11', '12'):
+        # Install into the interpreter that has torch; CuPy is optional, so a failure is only reported
+        cmd = [sys.executable, '-m', 'pip', 'install', f'cupy-cuda{major}x>=12.0.0']
+        ok = subprocess.run(cmd).returncode == 0
+        print(f'Installed CuPy for CUDA {major}.x' if ok else f'CuPy install failed for CUDA {major}.x')
+    else:
+        print(f'Unsupported CUDA version: {cuda_version}')
+else:
+    print('CUDA not available, skipping CuPy installation')
+"
+
 # Install SAM2 separately (not on PyPI)
 echo "🎯 Installing SAM2..."
 pip install git+https://github.com/facebookresearch/segment-anything-2.git
diff --git a/vr180_matting/vr180_processor.py b/vr180_matting/vr180_processor.py
index 9c375ae..145ad30 100644
--- a/vr180_matting/vr180_processor.py
+++ b/vr180_matting/vr180_processor.py
@@ -89,7 +89,7 @@ class VR180Processor(VideoProcessor):
     
     def combine_sbs_frame(self, left_eye: np.ndarray, right_eye: np.ndarray) -> np.ndarray:
         """
-        Combine left and right eye frames back into side-by-side format
+        Combine left and right eye frames back into side-by-side format with GPU acceleration
         
         Args:
             left_eye: Left eye frame
@@ -98,15 +98,24 @@ class VR180Processor(VideoProcessor):
         Returns:
             Combined SBS frame
         """
-        # Ensure frames have same height
-        if left_eye.shape[0] != right_eye.shape[0]:
-            target_height = min(left_eye.shape[0], right_eye.shape[0])
-            left_eye = cv2.resize(left_eye, (left_eye.shape[1], target_height))
-            right_eye = cv2.resize(right_eye, (right_eye.shape[1], target_height))
-        
-        # Combine horizontally
-        combined = np.hstack([left_eye, right_eye])
-        return combined
+        # Ensure frames have same height. OpenCV resize runs on the CPU, so do it once
+        # up front rather than uploading frames to the GPU and then discarding them
+        if left_eye.shape[0] != right_eye.shape[0]:
+            target_height = min(left_eye.shape[0], right_eye.shape[0])
+            left_eye = cv2.resize(left_eye, (left_eye.shape[1], target_height))
+            right_eye = cv2.resize(right_eye, (right_eye.shape[1], target_height))
+
+        try:
+            import cupy as cp
+        except ImportError:
+            # Fallback to CPU NumPy
+            return np.hstack([left_eye, right_eye])
+
+        # Transfer to GPU and combine horizontally
+        combined_gpu = cp.hstack([cp.asarray(left_eye), cp.asarray(right_eye)])
+
+        # Transfer back to CPU
+        return cp.asnumpy(combined_gpu)
     
     def process_with_disparity_mapping(self, 
                                      frames: List[np.ndarray], 
@@ -420,7 +444,7 @@ class VR180Processor(VideoProcessor):
                                    left_results: List[np.ndarray], 
                                    right_results: List[np.ndarray]) -> List[np.ndarray]:
         """
-        Validate and correct stereo consistency between left and right eye results
+        Validate and correct stereo consistency between left and right eye results using GPU acceleration
         
         Args:
             left_results: Left eye processed frames
@@ -429,9 +453,84 @@ class VR180Processor(VideoProcessor):
         Returns:
             Validated right eye frames
         """
+        print(f"🔍 VALIDATION: Starting stereo consistency check ({len(left_results)} frames)")
+        
+        try:
+            import cupy as cp
+            return self._validate_stereo_consistency_gpu(left_results, right_results)
+        except ImportError:
+            print("   Warning: CuPy not available, using CPU validation")
+            return self._validate_stereo_consistency_cpu(left_results, right_results)
+    
+    def _validate_stereo_consistency_gpu(self, 
+                                       left_results: List[np.ndarray], 
+                                       right_results: List[np.ndarray]) -> List[np.ndarray]:
+        """
+        GPU-accelerated batch stereo validation using CuPy
+
+        Args:
+            left_results: Left eye processed frames
+            right_results: Right eye processed frames
+
+        Returns:
+            Validated right eye frames, one per left/right pair
+        """
+        import cupy as cp
+
+        print("   Using GPU acceleration for stereo validation")
+
+        # Pair frames like the CPU path does with zip(); cp.stack() raises on an empty list
+        frame_count = min(len(left_results), len(right_results))
+        if frame_count == 0:
+            print("✅ VALIDATION: GPU stereo consistency check complete")
+            return []
+
+        print("   Transferring frames to GPU...")
+        left_stack = cp.stack([cp.asarray(frame) for frame in left_results[:frame_count]])
+        right_stack = cp.stack([cp.asarray(frame) for frame in right_results[:frame_count]])
+
+        print("   Computing mask areas on GPU...")
+        if left_stack.shape[3] == 4:  # Alpha channel
+            left_masks = left_stack[:, :, :, 3] > 0
+            right_masks = right_stack[:, :, :, 3] > 0
+        else:  # Green screen detection
+            bg_color = cp.array(self.config.output.background_color)
+            left_masks = cp.abs(left_stack.astype(cp.float32) - bg_color).sum(axis=3) > 30
+            right_masks = cp.abs(right_stack.astype(cp.float32) - bg_color).sum(axis=3) > 30
+
+        # Per-frame mask areas for the whole batch, computed on the GPU
+        left_areas = cp.sum(left_masks, axis=(1, 2)).astype(cp.float32)
+        right_areas = cp.sum(right_masks, axis=(1, 2)).astype(cp.float32)
+        # Only the per-frame ratios go back to the CPU (epsilon avoids division by zero);
+        area_ratios_cpu = cp.asnumpy(right_areas / (left_areas + 1e-6))
+        needs_correction_cpu = (area_ratios_cpu < 0.5) | (area_ratios_cpu > 2.0)
+        correction_count = int(needs_correction_cpu.sum())
+        print(f"   GPU validation complete: {correction_count}/{frame_count} frames need correction")
+
+        validated_frames = []
+        for i, (needs_fix, ratio) in enumerate(zip(needs_correction_cpu, area_ratios_cpu)):
+            if i % 100 == 0:
+                print(f"   Processing validation results: {i}/{frame_count}")
+            if needs_fix:
+                corrected = self._apply_stereo_correction(left_results[i], right_results[i], float(ratio))
+                validated_frames.append(corrected)
+            else:
+                validated_frames.append(right_results[i])
+
+        print("✅ VALIDATION: GPU stereo consistency check complete")
+        return validated_frames
+    
+    def _validate_stereo_consistency_cpu(self, 
+                                       left_results: List[np.ndarray], 
+                                       right_results: List[np.ndarray]) -> List[np.ndarray]:
+        """CPU fallback for stereo validation"""
+        print("   Using CPU validation (slower)")
         validated_frames = []
         
         for i, (left_frame, right_frame) in enumerate(zip(left_results, right_results)):
+            if i % 50 == 0:  # Progress every 50 frames
+                print(f"   CPU validation progress: {i}/{len(left_results)}")
+            
             # Simple validation: check if mask areas are similar
             left_mask_area = self._get_mask_area(left_frame)
             right_mask_area = self._get_mask_area(right_frame)
@@ -448,6 +547,7 @@ class VR180Processor(VideoProcessor):
             else:
                 validated_frames.append(right_frame)
         
+        print("✅ VALIDATION: CPU stereo consistency check complete")
         return validated_frames
     
     def _create_empty_masks_from_count(self, num_frames: int, frame_shape: tuple) -> List[np.ndarray]:
@@ -465,15 +565,34 @@ class VR180Processor(VideoProcessor):
         return empty_frames
     
     def _get_mask_area(self, frame: np.ndarray) -> float:
-        """Get mask area from processed frame"""
-        if frame.shape[2] == 4:  # Alpha channel
-            mask = frame[:, :, 3] > 0
-        else:  # Green screen - detect non-background pixels
-            bg_color = np.array(self.config.output.background_color)
-            diff = np.abs(frame.astype(np.float32) - bg_color).sum(axis=2)
-            mask = diff > 30  # Threshold for non-background
-        
-        return np.sum(mask)
+        """
+        Get mask area from processed frame, using GPU acceleration when CuPy is available
+
+        Args:
+            frame: Processed frame; 4 channels are treated as having an alpha channel,
+                anything else is compared against config.output.background_color
+
+        Returns:
+            Mask pixel count: Python int on the CuPy path, NumPy integer on the CPU fallback
+        """
+        try:
+            import cupy as xp  # GPU array module; provides the same calls as NumPy below
+        except ImportError:
+            xp = np  # Fallback to CPU NumPy if CuPy not available
+
+        # Transfer to GPU (a no-op view when running on NumPy)
+        pixels = xp.asarray(frame)
+
+        if frame.shape[2] == 4:  # Alpha channel
+            mask = pixels[:, :, 3] > 0
+        else:  # Green screen - detect non-background pixels
+            bg_color = xp.array(self.config.output.background_color)
+            diff = xp.abs(pixels.astype(xp.float32) - bg_color).sum(axis=2)
+            mask = diff > 30  # Threshold for non-background
+
+        area = xp.sum(mask)
+        # Keep each path's return type: plain int from the GPU, NumPy integer from the CPU
+        return area if xp is np else int(area)
     
     def _apply_stereo_correction(self, 
                                left_frame: np.ndarray,