From 725a7814560dcf88b1d03f421ca8fe348f522baa Mon Sep 17 00:00:00 2001
From: Scott Register
Date: Sat, 26 Jul 2025 12:29:32 -0700
Subject: [PATCH] cupy

---
 requirements.txt                 |   5 +-
 runpod_setup.sh                  |  31 +++++
 vr180_matting/vr180_processor.py | 135 ++++++++++++++++++---
 3 files changed, 160 insertions(+), 11 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5ef24af..70f6064 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,7 @@ ultralytics>=8.0.0
 tqdm>=4.65.0
 psutil>=5.9.0
 ffmpeg-python>=0.2.0
-decord>=0.6.0
\ No newline at end of file
+decord>=0.6.0
+# GPU acceleration (optional; speeds up stereo validation). Uncomment ONE line matching your CUDA version:
+# cupy-cuda11x>=12.0.0  # CUDA 11.x
+# cupy-cuda12x>=12.0.0  # CUDA 12.x
diff --git a/runpod_setup.sh b/runpod_setup.sh
index 5aa992c..046ddc0 100755
--- a/runpod_setup.sh
+++ b/runpod_setup.sh
@@ -18,6 +18,37 @@ pip install -r requirements.txt
 echo "📹 Installing decord for video processing..."
 pip install decord
 
+# Install CuPy for GPU acceleration of stereo validation
+echo "🚀 Installing CuPy for GPU acceleration..."
+# Auto-detect the CUDA version PyTorch was built against and install the matching CuPy wheel
+python -c "
+import subprocess
+import sys
+
+import torch
+
+wheels = {'11': 'cupy-cuda11x>=12.0.0', '12': 'cupy-cuda12x>=12.0.0'}
+
+if not torch.cuda.is_available():
+    print('CUDA not available, skipping CuPy installation')
+    sys.exit(0)
+
+cuda_version = torch.version.cuda or ''
+print(f'CUDA version detected: {cuda_version}')
+major = cuda_version.split('.')[0]
+wheel = wheels.get(major)
+if wheel is None:
+    print(f'Unsupported CUDA version: {cuda_version}')
+    sys.exit(0)
+
+# Run pip through this interpreter so CuPy lands in the same environment as torch
+result = subprocess.run([sys.executable, '-m', 'pip', 'install', wheel])
+if result.returncode == 0:
+    print(f'Installed CuPy for CUDA {major}.x')
+else:
+    print(f'WARNING: CuPy install failed (exit {result.returncode}); stereo validation will fall back to CPU')
+"
+
 # Install SAM2 separately (not on PyPI)
 echo "🎯 Installing SAM2..."
 pip install git+https://github.com/facebookresearch/segment-anything-2.git
diff --git a/vr180_matting/vr180_processor.py b/vr180_matting/vr180_processor.py
index 9c375ae..145ad30 100644
--- a/vr180_matting/vr180_processor.py
+++ b/vr180_matting/vr180_processor.py
@@ -98,6 +98,9 @@ class VR180Processor(VideoProcessor):
         Returns:
             Combined SBS frame
         """
+        # Deliberately CPU-only: stacking is a plain memory copy, so a CuPy round trip
+        # (host -> device -> host) would move every frame across the bus twice and be
+        # slower than np.hstack.
         # Ensure frames have same height
         if left_eye.shape[0] != right_eye.shape[0]:
             target_height = min(left_eye.shape[0], right_eye.shape[0])
@@ -420,7 +423,7 @@ class VR180Processor(VideoProcessor):
                                      left_results: List[np.ndarray],
                                      right_results: List[np.ndarray]) -> List[np.ndarray]:
         """
-        Validate and correct stereo consistency between left and right eye results
+        Validate and correct stereo consistency between left and right eye results (GPU-accelerated when CuPy is usable)
 
         Args:
             left_results: Left eye processed frames
@@ -429,9 +432,102 @@ class VR180Processor(VideoProcessor):
         Returns:
             Validated right eye frames
         """
+        print(f"🔍 VALIDATION: Starting stereo consistency check ({len(left_results)} frames)")
+
+        # Skip the GPU for empty input (stacking rejects it) and after an earlier GPU failure
+        if left_results and right_results and not getattr(self, "_cupy_unusable", False):
+            try:
+                return self._validate_stereo_consistency_gpu(left_results, right_results)
+            except ImportError:
+                print("   Warning: CuPy not available, using CPU validation")
+            except Exception as exc:
+                # CuPy can import cleanly yet fail at first use (no visible device,
+                # driver mismatch, out of memory); fall back instead of aborting the run
+                print(f"   Warning: GPU validation failed ({exc}), using CPU validation")
+            # Remember the failure so per-frame helpers stop retrying CuPy as well
+            self._cupy_unusable = True
+
+        return self._validate_stereo_consistency_cpu(left_results, right_results)
+
+    def _validate_stereo_consistency_gpu(self,
+                                         left_results: List[np.ndarray],
+                                         right_results: List[np.ndarray],
+                                         batch_size: int = 8) -> List[np.ndarray]:
+        """
+        GPU-accelerated stereo validation using CuPy, processed in bounded batches
+
+        Args:
+            left_results: Left eye processed frames
+            right_results: Right eye processed frames
+            batch_size: Frames per eye uploaded to the GPU at once; bounds device memory
+
+        Returns:
+            Validated right eye frames
+
+        Raises:
+            ImportError: If CuPy is not installed
+        """
+        import cupy as cp
+
+        print("   Using GPU acceleration for stereo validation")
+
+        def mask_areas(frames):
+            """Return per-frame foreground pixel counts (float32, on GPU) for one batch"""
+            stack = cp.asarray(np.stack(frames))
+            if stack.shape[3] == 4:  # Alpha channel
+                masks = stack[:, :, :, 3] > 0
+            else:  # Green screen detection
+                bg_color = cp.asarray(self.config.output.background_color)
+                masks = cp.abs(stack.astype(cp.float32) - bg_color).sum(axis=3) > 30
+            return masks.sum(axis=(1, 2)).astype(cp.float32)
+
+        # Pair frames the way zip() does in the CPU path, and upload them in batches:
+        # stacking a whole clip (plus its float32 copy) at once can exhaust GPU memory
+        num_frames = min(len(left_results), len(right_results))
+        batch_size = max(1, batch_size)
+        area_ratios = np.empty(num_frames, dtype=np.float32)
+
+        print("   Computing mask areas on GPU...")
+        for start in range(0, num_frames, batch_size):
+            stop = min(start + batch_size, num_frames)
+            left_areas = mask_areas(left_results[start:stop])
+            right_areas = mask_areas(right_results[start:stop])
+            area_ratios[start:stop] = cp.asnumpy(right_areas / (left_areas + 1e-6))
+
+        # Find frames needing correction
+        needs_correction = (area_ratios < 0.5) | (area_ratios > 2.0)
+        correction_count = int(needs_correction.sum())
+
+        print(f"   GPU validation complete: {correction_count}/{num_frames} frames need correction")
+
+        validated_frames = []
+        for i, (needs_fix, ratio) in enumerate(zip(needs_correction, area_ratios)):
+            if i % 100 == 0:
+                print(f"   Processing validation results: {i}/{num_frames}")
+
+            if needs_fix:
+                # Apply correction
+                corrected_frame = self._apply_stereo_correction(
+                    left_results[i], right_results[i], float(ratio)
+                )
+                validated_frames.append(corrected_frame)
+            else:
+                validated_frames.append(right_results[i])
+
+        print("✅ VALIDATION: GPU stereo consistency check complete")
+        return validated_frames
+
+    def _validate_stereo_consistency_cpu(self,
+                                         left_results: List[np.ndarray],
+                                         right_results: List[np.ndarray]) -> List[np.ndarray]:
+        """CPU fallback for stereo validation"""
+        print("   Using CPU validation (slower)")
         validated_frames = []
 
         for i, (left_frame, right_frame) in enumerate(zip(left_results, right_results)):
+            if i % 50 == 0:  # Progress every 50 frames
+                print(f"   CPU validation progress: {i}/{len(left_results)}")
+
             # Simple validation: check if mask areas are similar
             left_mask_area = self._get_mask_area(left_frame)
             right_mask_area = self._get_mask_area(right_frame)
@@ -448,6 +544,7 @@ class VR180Processor(VideoProcessor):
             else:
                 validated_frames.append(right_frame)
 
+        print("✅ VALIDATION: CPU stereo consistency check complete")
         return validated_frames
 
     def _create_empty_masks_from_count(self, num_frames: int, frame_shape: tuple) -> List[np.ndarray]:
@@ -465,15 +562,33 @@ class VR180Processor(VideoProcessor):
         return empty_frames
 
     def _get_mask_area(self, frame: np.ndarray) -> float:
-        """Get mask area from processed frame"""
-        if frame.shape[2] == 4:  # Alpha channel
-            mask = frame[:, :, 3] > 0
-        else:  # Green screen - detect non-background pixels
-            bg_color = np.array(self.config.output.background_color)
-            diff = np.abs(frame.astype(np.float32) - bg_color).sum(axis=2)
-            mask = diff > 30  # Threshold for non-background
-
-        return np.sum(mask)
+        """
+        Get mask area (foreground pixel count) from processed frame
+
+        Runs on the GPU through CuPy when it is usable, on NumPy otherwise.
+        """
+        if not getattr(self, "_cupy_unusable", False):
+            try:
+                import cupy as cp
+                return self._count_foreground_pixels(cp, cp.asarray(frame))
+            except Exception:
+                # Missing CuPy or a CUDA failure (no device, out of memory): remember it
+                # so the import and upload are not retried for every single frame
+                self._cupy_unusable = True
+
+        return self._count_foreground_pixels(np, frame)
+
+    def _count_foreground_pixels(self, xp, frame) -> int:
+        """Count foreground pixels of one frame using array module xp (numpy or cupy)"""
+        if frame.shape[2] == 4:  # Alpha channel
+            mask = frame[:, :, 3] > 0
+        else:  # Green screen - detect non-background pixels
+            bg_color = xp.asarray(self.config.output.background_color)
+            diff = xp.abs(frame.astype(xp.float32) - bg_color).sum(axis=2)
+            mask = diff > 30  # Threshold for non-background
+
+        # int() pulls the scalar back to the host when xp is cupy
+        return int(mask.sum())
 
     def _apply_stereo_correction(self,
                                  left_frame: np.ndarray,