dont use predictor over and over

not too hard
old sam cleanup
2025-07-26 13:40:47 -07:00 · 2025-07-26 13:30:13 -07:00 · 2025-07-26 13:21:39 -07:00 · 2025-07-26 13:03:04 -07:00 · 2025-07-26 12:42:16 -07:00 · 2025-07-26 12:29:32 -07:00
5 changed files with 296 additions and 29 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,6 @@ tqdm>=4.65.0
 psutil>=5.9.0
 ffmpeg-python>=0.2.0
 decord>=0.6.0
 # GPU acceleration (optional but recommended for stereo validation speedup)
 # cupy-cuda11x>=12.0.0  # For CUDA 11.x
 # cupy-cuda12x>=12.0.0  # For CUDA 12.x - uncomment appropriate version
--- a/runpod_setup.sh
+++ b/runpod_setup.sh
@@ -18,6 +18,28 @@ pip install -r requirements.txt
 echo "📹 Installing decord for video processing..."
 pip install decord
 # Install CuPy for GPU acceleration of stereo validation
 echo "🚀 Installing CuPy for GPU acceleration..."
 # Auto-detect the CUDA version PyTorch was built with and install the matching CuPy wheel.
 # CuPy is optional, so a failed install prints a warning instead of claiming success.
 python -c "
 import subprocess
 import sys
 import torch
 def install_cupy(package, label):
    # Run pip through this interpreter so CuPy lands in the same environment as torch
    result = subprocess.run([sys.executable, '-m', 'pip', 'install', package])
    if result.returncode == 0:
        print(f'Installed CuPy for {label}')
    else:
        print(f'Warning: CuPy install for {label} failed (pip exit code {result.returncode})')
 if torch.cuda.is_available():
    # torch.version.cuda is None on builds without CUDA (e.g. ROCm), so normalise it to a string
    cuda_version = torch.version.cuda or 'unknown'
    print(f'CUDA version detected: {cuda_version}')
    if cuda_version.startswith('11.'):
        install_cupy('cupy-cuda11x>=12.0.0', 'CUDA 11.x')
    elif cuda_version.startswith('12.'):
        install_cupy('cupy-cuda12x>=12.0.0', 'CUDA 12.x')
    else:
        print(f'Unsupported CUDA version: {cuda_version}')
 else:
    print('CUDA not available, skipping CuPy installation')
 "
 # Install SAM2 separately (not on PyPI)
 echo "🎯 Installing SAM2..."
 pip install git+https://github.com/facebookresearch/segment-anything-2.git
--- a/vr180_matting/sam2_wrapper.py
+++ b/vr180_matting/sam2_wrapper.py
@@ -7,6 +7,7 @@ import warnings
 import os
 import tempfile
 import shutil
 import gc
 try:
    from sam2.build_sam import build_sam2_video_predictor
@@ -32,6 +33,8 @@ class SAM2VideoMatting:
        self.device = device
        self.memory_offload = memory_offload
        self.fp16 = fp16
        self.model_cfg = model_cfg
        self.checkpoint_path = checkpoint_path
        self.predictor = None
        self.inference_state = None
        self.video_segments = {}
@@ -74,7 +77,8 @@ class SAM2VideoMatting:
    def init_video_state(self, video_frames: List[np.ndarray] = None, video_path: str = None) -> None:
        """Initialize video inference state"""
        if self.predictor is None:
-            raise RuntimeError("SAM2 model not loaded")
+            # Recreate predictor if it was cleaned up
            self._load_model(self.model_cfg, self.checkpoint_path)
        if video_path is not None:
            # Use video path directly (SAM2's preferred method)
@@ -256,11 +260,23 @@ class SAM2VideoMatting:
        """Clean up resources"""
        if self.inference_state is not None:
            try:
-                if hasattr(self.predictor, 'cleanup_state'):
+                # Reset SAM2 state first (critical for memory cleanup)
                if self.predictor is not None and hasattr(self.predictor, 'reset_state'):
                    self.predictor.reset_state(self.inference_state)
                # Fallback to cleanup_state if available
                elif self.predictor is not None and hasattr(self.predictor, 'cleanup_state'):
                    self.predictor.cleanup_state(self.inference_state)
                # Explicitly delete inference state and video segments
                del self.inference_state
                if hasattr(self, 'video_segments') and self.video_segments:
                    del self.video_segments
                    self.video_segments = {}
            except Exception as e:
                warnings.warn(f"Failed to cleanup SAM2 state: {e}")
-            
+            finally:
                self.inference_state = None
        # Clean up temporary video file
@@ -277,6 +293,22 @@ class SAM2VideoMatting:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # Explicitly delete predictor for fresh creation next time
        if self.predictor is not None:
            try:
                del self.predictor
            except Exception as e:
                warnings.warn(f"Failed to delete predictor: {e}")
            finally:
                self.predictor = None
        # Force garbage collection (critical for memory leak prevention)
        gc.collect()
    def __del__(self) -> None:
        """Destructor to ensure cleanup.

        Calls ``self.cleanup()`` when the object is finalized. Any
        ``Exception`` raised by the cleanup is swallowed so that finalization
        never propagates an error to the caller.
        """
        try:
            self.cleanup()
        except Exception:
            # Ignore errors during Python shutdown
            # (presumably modules that cleanup() relies on may already be
            # torn down at that point - TODO confirm)
            pass
--- a/vr180_matting/video_processor.py
+++ b/vr180_matting/video_processor.py
@@ -132,6 +132,26 @@ class VideoProcessor:
        except ImportError:
            pass
        # Clear OpenCV internal caches
        try:
            # Clear OpenCV video capture cache
            cv2.setUseOptimized(False)
            cv2.setUseOptimized(True)
        except Exception:
            pass
        # Clear CuPy caches if available  
        try:
            import cupy as cp
            cp._default_memory_pool.free_all_blocks()
            cp._default_pinned_memory_pool.free_all_blocks()
            cp.get_default_memory_pool().free_all_blocks()
            cp.get_default_pinned_memory_pool().free_all_blocks()
        except ImportError:
            pass
        except Exception as e:
            print(f"   Warning: Could not clear CuPy cache: {e}")
        # Force Linux to release memory back to OS
        if sys.platform == 'linux':
            try:
@@ -623,16 +643,27 @@ class VideoProcessor:
            # Load and merge chunks from disk
            print("\nLoading and merging chunks...")
            chunk_results = []
-            for chunk_file in chunk_files:
+            for i, chunk_file in enumerate(chunk_files):
                print(f"Loading {chunk_file.name}...")
                chunk_data = np.load(str(chunk_file))
                chunk_results.append(chunk_data['frames'])
                chunk_data.close()  # Close the file
                # Delete chunk file immediately after loading to free disk space
                try:
                    chunk_file.unlink()
                    print(f"   Deleted chunk file {chunk_file.name}")
                except Exception as e:
                    print(f"   Warning: Could not delete chunk file: {e}")
                # Aggressive cleanup every few chunks to prevent accumulation
                if i % 3 == 0 and i > 0:
                    self._aggressive_memory_cleanup(f"after loading chunk {i}")
            # Merge chunks
            final_frames = self.merge_overlapping_chunks(chunk_results, overlap_frames)
-            # Free chunk results after merging
+            # Free chunk results after merging - this is critical!
            del chunk_results
            self._aggressive_memory_cleanup("after merging chunks")
--- a/vr180_matting/vr180_processor.py
+++ b/vr180_matting/vr180_processor.py
@@ -89,7 +89,7 @@ class VR180Processor(VideoProcessor):
    def combine_sbs_frame(self, left_eye: np.ndarray, right_eye: np.ndarray) -> np.ndarray:
        """
-        Combine left and right eye frames back into side-by-side format
+        Combine left and right eye frames back into side-by-side format with GPU acceleration
        Args:
            left_eye: Left eye frame
@@ -98,14 +98,44 @@ class VR180Processor(VideoProcessor):
        Returns:
            Combined SBS frame
        """
        try:
            import cupy as cp
            # Transfer to GPU for faster combination
            left_gpu = cp.asarray(left_eye)
            right_gpu = cp.asarray(right_eye)
            # Ensure frames have same height
            if left_gpu.shape[0] != right_gpu.shape[0]:
                target_height = min(left_gpu.shape[0], right_gpu.shape[0])
                # Note: OpenCV resize not available in CuPy, fall back to CPU for resize
                left_eye = cv2.resize(left_eye, (left_eye.shape[1], target_height))
                right_eye = cv2.resize(right_eye, (right_eye.shape[1], target_height))
                left_gpu = cp.asarray(left_eye)
                right_gpu = cp.asarray(right_eye)
            # Combine horizontally on GPU (much faster for large arrays)
            combined_gpu = cp.hstack([left_gpu, right_gpu])
            # Transfer back to CPU and ensure we get a copy, not a view
            combined = cp.asnumpy(combined_gpu).copy()
            # Free GPU memory immediately
            del left_gpu, right_gpu, combined_gpu
            cp._default_memory_pool.free_all_blocks()
            return combined
        except ImportError:
            # Fallback to CPU NumPy
            # Ensure frames have same height
            if left_eye.shape[0] != right_eye.shape[0]:
                target_height = min(left_eye.shape[0], right_eye.shape[0])
                left_eye = cv2.resize(left_eye, (left_eye.shape[1], target_height))
                right_eye = cv2.resize(right_eye, (right_eye.shape[1], target_height))
-        # Combine horizontally
+            # Combine horizontally and ensure we get a copy, not a view
-        combined = np.hstack([left_eye, right_eye])
+            combined = np.hstack([left_eye, right_eye]).copy()
            return combined
    def process_with_disparity_mapping(self, 
@@ -152,6 +182,10 @@ class VR180Processor(VideoProcessor):
        with self.memory_manager.memory_monitor(f"left eye chunk {chunk_idx}"):
            left_matted = self._process_eye_sequence(left_eye_frames, "left", chunk_idx)
        # Free left eye frames after processing (before right eye to save memory)
        del left_eye_frames
        self._aggressive_memory_cleanup(f"After left eye processing chunk {chunk_idx}")
        # Process right eye with cross-validation
        print("Processing right eye with cross-validation...")
        with self.memory_manager.memory_monitor(f"right eye chunk {chunk_idx}"):
@@ -159,6 +193,10 @@ class VR180Processor(VideoProcessor):
                right_eye_frames, left_matted, "right", chunk_idx
            )
        # Free right eye frames after processing
        del right_eye_frames
        self._aggressive_memory_cleanup(f"After right eye processing chunk {chunk_idx}")
        # Combine results back to SBS format
        combined_frames = []
        for left_frame, right_frame in zip(left_matted, right_matted):
@@ -169,6 +207,11 @@ class VR180Processor(VideoProcessor):
                combined = {'left': left_frame, 'right': right_frame}
            combined_frames.append(combined)
        # Free the individual eye results after combining
        del left_matted
        del right_matted
        self._aggressive_memory_cleanup(f"After combining frames chunk {chunk_idx}")
        return combined_frames
    def _process_eye_sequence(self, 
@@ -371,8 +414,9 @@ class VR180Processor(VideoProcessor):
                matted_frames.append(matted_frame)
-            # Free reloaded frames 
+            # Free reloaded frames and video segments completely
            del reloaded_frames
            del video_segments  # This holds processed masks from SAM2
            self._aggressive_memory_cleanup(f"After mask application ({eye_name} eye)")
            return matted_frames
@@ -414,13 +458,17 @@ class VR180Processor(VideoProcessor):
            left_eye_results, right_matted
        )
        # CRITICAL: Free the intermediate results to prevent memory accumulation
        del left_eye_results  # Don't keep left eye results after validation
        del right_matted     # Don't keep unvalidated right results
        return validated_results
    def _validate_stereo_consistency(self, 
                                   left_results: List[np.ndarray], 
                                   right_results: List[np.ndarray]) -> List[np.ndarray]:
        """
-        Validate and correct stereo consistency between left and right eye results
+        Validate and correct stereo consistency between left and right eye results using GPU acceleration
        Args:
            left_results: Left eye processed frames
@@ -429,9 +477,120 @@ class VR180Processor(VideoProcessor):
        Returns:
            Validated right eye frames
        """
        print(f"🔍 VALIDATION: Starting stereo consistency check ({len(left_results)} frames)")
        try:
            import cupy as cp
            return self._validate_stereo_consistency_gpu(left_results, right_results)
        except ImportError:
            print("   Warning: CuPy not available, using CPU validation")
            return self._validate_stereo_consistency_cpu(left_results, right_results)
    def _validate_stereo_consistency_gpu(self,
                                       left_results: List[np.ndarray],
                                       right_results: List[np.ndarray]) -> List[np.ndarray]:
        """GPU-accelerated batch stereo validation using CuPy with memory-safe batching.

        Frames are uploaded to the GPU in fixed-size batches. For every frame
        pair the foreground pixel count of the right eye is divided by that of
        the left eye; pairs whose ratio is below 0.5 or above 2.0 are passed to
        ``self._apply_stereo_correction``, all other right frames are returned
        unchanged.

        Args:
            left_results: Left eye processed frames
            right_results: Right eye processed frames

        Returns:
            Validated right eye frames

        Raises:
            ImportError: If CuPy is not installed (the caller falls back to
                the CPU implementation in that case).
        """
        import cupy as cp

        print("   Using GPU acceleration for stereo validation")

        # Process in batches to avoid GPU OOM
        batch_size = 50  # Process 50 frames at a time (safe for 45GB GPU)
        total_frames = len(left_results)
        area_ratios_all = []
        needs_correction_all = []

        # Public accessor for the pool CuPy allocates device memory from
        device_pool = cp.get_default_memory_pool()

        # Background colour is loop-invariant: upload it lazily, at most once
        bg_color = None

        print(f"   Processing {total_frames} frames in batches of {batch_size}...")
        for batch_start in range(0, total_frames, batch_size):
            batch_end = min(batch_start + batch_size, total_frames)
            if batch_start % 100 == 0:
                print(f"   GPU batch {batch_start//batch_size + 1}: frames {batch_start}-{batch_end}")

            # Convert batch to GPU
            left_stack = cp.stack([cp.asarray(frame) for frame in left_results[batch_start:batch_end]])
            right_stack = cp.stack([cp.asarray(frame) for frame in right_results[batch_start:batch_end]])

            # Batch calculate foreground masks for this batch
            if left_stack.shape[3] == 4:  # Alpha channel
                left_masks = left_stack[:, :, :, 3] > 0
                right_masks = right_stack[:, :, :, 3] > 0
            else:  # Green screen detection
                if bg_color is None:
                    bg_color = cp.array(self.config.output.background_color)
                # The float difference buffers are deliberately not bound to
                # names: they are the largest temporaries here and must be
                # releasable by free_all_blocks() below.
                left_masks = cp.abs(left_stack.astype(cp.float32) - bg_color).sum(axis=3) > 30
                right_masks = cp.abs(right_stack.astype(cp.float32) - bg_color).sum(axis=3) > 30

            # Calculate areas for this batch
            left_areas = cp.sum(left_masks, axis=(1, 2))
            right_areas = cp.sum(right_masks, axis=(1, 2))
            # The epsilon avoids division by zero when the left mask is empty
            area_ratios = right_areas.astype(cp.float32) / (left_areas.astype(cp.float32) + 1e-6)

            # Find frames needing correction in this batch
            needs_correction = (area_ratios < 0.5) | (area_ratios > 2.0)

            # Transfer batch results back to CPU as plain Python values
            area_ratios_all.extend(cp.asnumpy(area_ratios).tolist())
            needs_correction_all.extend(cp.asnumpy(needs_correction).tolist())

            # Free GPU memory for this batch
            del left_stack, right_stack, left_masks, right_masks
            del left_areas, right_areas, area_ratios, needs_correction
            device_pool.free_all_blocks()

        # Drop the last GPU reference held by this function before the final release
        del bg_color

        # CRITICAL: Release ALL CuPy memory back to system after validation
        try:
            device_pool.free_all_blocks()
            cp.get_default_pinned_memory_pool().free_all_blocks()
            print("   CuPy memory pools cleared")
        except Exception as e:
            print(f"   Warning: Could not clear CuPy memory pools: {e}")

        correction_count = sum(needs_correction_all)
        print(f"   GPU validation complete: {correction_count}/{total_frames} frames need correction")

        # Apply corrections using CPU results
        validated_frames = []
        for i, (needs_fix, ratio) in enumerate(zip(needs_correction_all, area_ratios_all)):
            if i % 100 == 0:
                print(f"   Processing validation results: {i}/{total_frames}")

            if needs_fix:
                # Apply correction
                validated_frames.append(
                    self._apply_stereo_correction(left_results[i], right_results[i], ratio)
                )
            else:
                validated_frames.append(right_results[i])

        print("✅ VALIDATION: GPU stereo consistency check complete")
        return validated_frames
    def _validate_stereo_consistency_cpu(self, 
                                       left_results: List[np.ndarray], 
                                       right_results: List[np.ndarray]) -> List[np.ndarray]:
        """CPU fallback for stereo validation"""
        print("   Using CPU validation (slower)")
        validated_frames = []
        for i, (left_frame, right_frame) in enumerate(zip(left_results, right_results)):
            if i % 50 == 0:  # Progress every 50 frames
                print(f"   CPU validation progress: {i}/{len(left_results)}")
            # Simple validation: check if mask areas are similar
            left_mask_area = self._get_mask_area(left_frame)
            right_mask_area = self._get_mask_area(right_frame)
@@ -448,6 +607,7 @@ class VR180Processor(VideoProcessor):
            else:
                validated_frames.append(right_frame)
        print("✅ VALIDATION: CPU stereo consistency check complete")
        return validated_frames
    def _create_empty_masks_from_count(self, num_frames: int, frame_shape: tuple) -> List[np.ndarray]:
@@ -465,7 +625,26 @@ class VR180Processor(VideoProcessor):
        return empty_frames
    def _get_mask_area(self, frame: np.ndarray) -> float:
-        """Get mask area from processed frame"""
+        """Get mask area from processed frame using GPU acceleration"""
        try:
            import cupy as cp
            # Transfer to GPU
            frame_gpu = cp.asarray(frame)
            if frame.shape[2] == 4:  # Alpha channel
                mask_gpu = frame_gpu[:, :, 3] > 0
            else:  # Green screen - detect non-background pixels
                bg_color_gpu = cp.array(self.config.output.background_color)
                diff_gpu = cp.abs(frame_gpu.astype(cp.float32) - bg_color_gpu).sum(axis=2)
                mask_gpu = diff_gpu > 30  # Threshold for non-background
            # Calculate area on GPU and return as Python int
            area = int(cp.sum(mask_gpu))
            return area
        except ImportError:
            # Fallback to CPU NumPy if CuPy not available
            if frame.shape[2] == 4:  # Alpha channel
                mask = frame[:, :, 3] > 0
            else:  # Green screen - detect non-background pixels
Author	SHA1	Message	Date
Scott Register	6f93abcb08	dont use predictor over and over	2025-07-26 13:40:47 -07:00
Scott Register	c368d6dc97	not too hard	2025-07-26 13:30:13 -07:00
Scott Register	e7e9c5597b	old sam cleanup	2025-07-26 13:21:39 -07:00
Scott Register	3af16df71e	more memleak fixes	2025-07-26 13:03:04 -07:00
Scott Register	df7b009a7b	fix gpu memory issue	2025-07-26 12:42:16 -07:00
Scott Register	725a781456	cupy	2025-07-26 12:29:32 -07:00