Compare commits

19 Commits

Author SHA1 Message Date
ccc68a3895 memleak fix hopefully 2025-07-26 12:25:55 -07:00
463f881eaf catagory A round 2 2025-07-26 11:56:51 -07:00
b642b562f0 optimizations A round 1 2025-07-26 11:04:04 -07:00
40ae537f7a memory stuff 2025-07-26 09:56:39 -07:00
28aa663b7b debug data 2025-07-26 09:31:50 -07:00
0244ba5204 fix some stuff 2025-07-26 09:24:30 -07:00
141302cccf ffmpegize 2025-07-26 09:16:45 -07:00
6b0eb6104d debug data 2025-07-26 09:14:11 -07:00
0f8818259e debug data 2025-07-26 09:10:59 -07:00
86274ba04a video debug 2025-07-26 09:07:57 -07:00
99c4da83af fix temp file 2025-07-26 09:01:38 -07:00
c4af7baf3d decord 2025-07-26 08:55:27 -07:00
3e21fd8678 fix again 2025-07-26 08:54:03 -07:00
d933d6b606 fix wrapper 2025-07-26 08:51:48 -07:00
7852303b40 maybe fix 2025-07-26 08:47:50 -07:00
e195d23584 make exec 2025-07-26 08:43:42 -07:00
eb9529b4ff please fix 2025-07-26 08:43:18 -07:00
a7c7cfbcba maybe fix 2025-07-26 08:39:35 -07:00
6ea3d3ae5d fix config path 2025-07-26 08:34:13 -07:00
8 changed files with 833 additions and 111 deletions

View File

@@ -3,8 +3,8 @@ input:
processing:
  scale_factor: 0.5  # A40 can handle 0.5 well
-  chunk_size: 0  # Auto-calculate based on A40's 48GB VRAM
  chunk_size: 600  # Category A.4: Larger chunks for better VRAM utilization (was 200)
-  overlap_frames: 60
  overlap_frames: 30  # Reduced overlap
detection:
  confidence_threshold: 0.7
@@ -14,14 +14,16 @@ matting:
  use_disparity_mapping: true
  memory_offload: false  # A40 has enough VRAM
  fp16: true
-  sam2_model_cfg: "sam2.1_hiera_l"
  sam2_model_cfg: "configs/sam2.1/sam2.1_hiera_l.yaml"
  sam2_checkpoint: "segment-anything-2/checkpoints/sam2.1_hiera_large.pt"
output:
  path: "/workspace/output/matted_video.mp4"
-  format: "alpha"
  format: "greenscreen"  # Changed to greenscreen for easier testing
  background_color: [0, 255, 0]
  maintain_sbs: true
  preserve_audio: true  # Category A.1: Audio preservation
  verify_sync: true  # Category A.2: Frame count validation
hardware:
  device: "cuda"

View File

@@ -9,3 +9,4 @@ ultralytics>=8.0.0
tqdm>=4.65.0
psutil>=5.9.0
ffmpeg-python>=0.2.0
decord>=0.6.0

runpod_setup.sh Normal file → Executable file
View File

@@ -14,6 +14,10 @@ echo "🐍 Installing Python dependencies..."
pip install --upgrade pip
pip install -r requirements.txt
# Install decord for SAM2 video loading
echo "📹 Installing decord for video processing..."
pip install decord
# Install SAM2 separately (not on PyPI)
echo "🎯 Installing SAM2..."
pip install git+https://github.com/facebookresearch/segment-anything-2.git

spec.md
View File

@@ -123,6 +123,204 @@ hardware:
3. **Performance Profiling**: Detailed resource usage analytics
4. **Quality Validation**: Comprehensive testing suite
## Post-Implementation Optimization Opportunities
*Based on the first successful 30-second test clip run (A40 GPU, 50% scale, 9 × 200-frame chunks)*
### Performance Analysis Findings
- **Processing Speed**: ~0.54 s per frame (64.4 s per 120-frame chunk)
- **VRAM Utilization**: Only 2.5% (1.11GB of 45GB available) - significantly underutilized
- **RAM Usage**: 106GB used of 494GB available (21.5%)
- **Primary Bottleneck**: Intermediate ffmpeg encoding operations per chunk
### Identified Optimization Categories
#### Category A: Performance Improvements (Quick Wins)
1. **Audio Track Preservation** ⚠️ **CRITICAL**
- Issue: Output video missing audio track from input
- Solution: Use ffmpeg to copy audio stream during final video creation
- Implementation: Add `-c:a copy` to final ffmpeg command
- Impact: Essential for production usability
- Risk: Low, standard ffmpeg operation
2. **Frame Count Synchronization** ⚠️ **CRITICAL**
- Issue: Audio sync drift if input/output frame counts differ
- Solution: Validate exact frame count preservation throughout pipeline
- Implementation: Frame count verification + duration matching
- Impact: Prevents audio desync in long videos
- Risk: Low, validation feature
3. **Memory Usage Reality Check** ⚠️ **IMPORTANT**
- Current assumption: Unlimited RAM for memory-only pipeline
- Reality: RunPod container limited to ~48GB RAM
- Risk calculation: 1-hour video = ~213k frames = potential 20-40GB+ memory usage
- Solution: Implement streaming output instead of full in-memory accumulation
- Impact: Enables processing of long-form content
- Risk: Medium, requires pipeline restructuring
4. **Larger Chunk Sizes**
- Current: 200 frames per chunk (conservative for 10GB RTX 3080)
- Opportunity: 600-800 frames per chunk on high-VRAM systems
- Impact: Reduce 9 chunks to 2-3 chunks, fewer intermediate operations
- Risk: Low, easily configurable
5. **Streaming Output Pipeline**
- Current: Accumulate all processed frames in memory, write once
- Opportunity: Write processed chunks to temporary segments, merge at end
- Impact: Constant memory usage regardless of video length
- Risk: Medium, requires temporary file management
6. **Enhanced Performance Profiling**
- Current: Basic memory monitoring
- Opportunity: Detailed timing per processing stage (detection, propagation, encoding)
- Impact: Identify exact bottlenecks for targeted optimization
- Risk: Low, debugging feature
7. **Parallel Eye Processing**
- Current: Sequential left eye → right eye processing
- Opportunity: Process both eyes simultaneously
- Impact: Up to ~50% reduction in wall-clock time, better GPU utilization
- Risk: Medium, memory management complexity
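A minimal sketch of item 7 (parallel eye processing), assuming the per-eye pipeline is wrapped in a hypothetical `process_single_eye(frames, eye_name)` callable and that the GPU has VRAM headroom for two concurrent streams; threads are enough because most of the heavy work runs inside CUDA kernels:

```python
from concurrent.futures import ThreadPoolExecutor

def process_eyes_in_parallel(left_frames, right_frames, process_single_eye):
    """Run the left- and right-eye pipelines concurrently.

    process_single_eye(frames, eye_name) is assumed to wrap the existing
    detection + SAM2 propagation steps for one eye. PyTorch releases the GIL
    while CUDA kernels execute, so two Python threads can keep the GPU busier
    than strictly sequential processing.
    """
    with ThreadPoolExecutor(max_workers=2) as pool:
        left_future = pool.submit(process_single_eye, left_frames, "left")
        right_future = pool.submit(process_single_eye, right_frames, "right")
        return left_future.result(), right_future.result()
```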
#### Category B: Stereo Consistency Fixes (Critical for VR)
1. **Master-Slave Eye Processing**
- Issue: Independent detection leads to mismatched person counts between eyes
- Solution: Use left eye detections as "seeds" for right eye processing
- Impact: Ensures identical person detection across stereo pair
- Risk: Low, maintains current quality while improving consistency
2. **Cross-Eye Detection Validation**
- Issue: Hair/clothing included on one eye but not the other
- Solution: Compare detection results, flag inconsistencies for reprocessing
- Impact: 90%+ stereo alignment improvement
- Risk: Low, fallback to current behavior
3. **Disparity-Aware Segmentation**
- Issue: Segmentation boundaries differ between eyes despite same person
- Solution: Use stereo disparity to correlate features between eyes
- Impact: True stereo-consistent matting
- Risk: High, complex implementation
4. **Joint Stereo Detection**
- Issue: YOLO runs independently on each eye
- Solution: Run YOLO on full SBS frame, split detections spatially
- Impact: Guaranteed identical detection counts
- Risk: Medium, requires detection coordinate mapping
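As a concrete illustration of item 4 (joint stereo detection), the sketch below splits full-SBS detections into per-eye coordinates; it assumes boxes arrive as `(x1, y1, x2, y2)` in SBS pixel space and is not the project's actual detector interface:

```python
def split_sbs_detections(detections, sbs_width):
    """Assign boxes detected on the full SBS frame to the left or right eye.

    Each physical person appears in both halves of the SBS frame, so a single
    detection pass yields candidate boxes for both eyes. Boxes whose centre
    lies in the left half stay in left-eye coordinates; right-half boxes are
    shifted back into the right eye's local coordinate system.
    """
    split = sbs_width // 2
    left_boxes, right_boxes = [], []
    for x1, y1, x2, y2 in detections:
        centre = (x1 + x2) / 2
        if centre < split:
            left_boxes.append((x1, y1, min(x2, split), y2))
        else:
            right_boxes.append((max(x1 - split, 0), y1, x2 - split, y2))
    return left_boxes, right_boxes
```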
#### Category C: Advanced Optimizations (Future)
1. **Adaptive Memory Management**
- Opportunity: Dynamic chunk sizing based on real-time VRAM usage
- Impact: Optimal resource utilization across different hardware
- Risk: Medium, complex heuristics (see the sketch after this list)
2. **Multi-Resolution Processing**
- Opportunity: Initial processing at lower resolution, edge refinement at full
- Impact: Speed improvement while maintaining quality
- Risk: Medium, quality validation required
3. **Enhanced Workflow Documentation**
- Issue: Unclear intermediate data lifecycle
- Solution: Detailed logging of chunk processing, optional intermediate preservation
- Impact: Better debugging and user understanding
- Risk: Low, documentation feature
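For Category C item 1, a hedged sketch of VRAM-aware chunk sizing; `vram_per_frame_gb` is a placeholder that would have to be measured on the target GPU, not a benchmarked value:

```python
import torch

def adaptive_chunk_size(vram_per_frame_gb=0.05, reserve_gb=4.0,
                        min_chunk=100, max_chunk=800):
    """Choose a chunk size from the VRAM that is actually free right now."""
    if not torch.cuda.is_available():
        return min_chunk
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    usable_gb = max(free_bytes / 1024**3 - reserve_gb, 0.0)
    chunk = int(usable_gb / vram_per_frame_gb)
    return max(min_chunk, min(chunk, max_chunk))
```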
### Implementation Strategy
- **Phase A**: Quick performance wins (larger chunks, profiling)
- **Phase B**: Stereo consistency (master-slave, validation)
- **Phase C**: Advanced features (disparity-aware, memory optimization)
### Configuration Extensions Required
```yaml
processing:
  chunk_size: 600          # Increase from 200 for high-VRAM systems
  memory_pipeline: false   # Skip intermediate video creation (disabled due to RAM limits)
  streaming_output: true   # Write chunks progressively instead of accumulating
  parallel_eyes: false     # Process eyes simultaneously
  max_memory_gb: 40        # Realistic RAM limit for RunPod containers

audio:
  preserve_audio: true     # Copy audio track from input to output
  verify_sync: true        # Validate frame count and duration matching
  audio_codec: "copy"      # Preserve original audio codec

stereo:
  consistency_mode: "master_slave"  # "independent", "master_slave", "joint"
  validation_threshold: 0.8         # Similarity threshold between eyes
  correction_method: "transfer"     # "transfer", "reprocess", "ensemble"

performance:
  profile_enabled: true          # Detailed timing analysis
  preserve_intermediates: false  # For debugging workflow

debugging:
  log_intermediate_workflow: true      # Document chunk lifecycle
  save_detection_visualization: false  # Debug detection mismatches
  frame_count_validation: true         # Ensure exact frame preservation
```
### Technical Implementation Details
#### Audio Preservation Implementation
```python
# During final video save, include audio stream copy
ffmpeg_cmd = [
    'ffmpeg', '-y',
    '-framerate', str(fps),
    '-i', frame_pattern,      # Video frames
    '-i', input_video_path,   # Original video for audio
    '-c:v', 'h264_nvenc',     # GPU video codec (with CPU fallback)
    '-c:a', 'copy',           # Copy audio without re-encoding
    '-map', '0:v:0',          # Map video from first input
    '-map', '1:a:0',          # Map audio from second input
    '-shortest',              # Match shortest stream duration
    output_path
]
```
#### Streaming Output Implementation
```python
# Instead of accumulating frames in memory:
class StreamingVideoWriter:
    def __init__(self, output_path, fps, audio_source):
        self.temp_segments = []
        self.current_segment = 0

    def write_chunk(self, processed_frames):
        # Write chunk to temporary segment
        segment_path = f"temp_segment_{self.current_segment}.mp4"
        self.write_video_segment(processed_frames, segment_path)
        self.temp_segments.append(segment_path)
        self.current_segment += 1

    def finalize(self):
        # Merge all segments with audio preservation
        self.merge_segments_with_audio()
```
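One possible way to fill in `merge_segments_with_audio` is ffmpeg's concat demuxer plus an audio map from the original input; paths, codecs, and the helper name are illustrative assumptions rather than the implemented behaviour:

```python
import subprocess
from pathlib import Path

def merge_segments_with_audio(segment_paths, audio_source, output_path):
    """Concatenate temporary video segments losslessly and copy the source audio."""
    list_file = Path("segments.txt")
    # The concat demuxer expects one "file '<path>'" entry per line
    list_file.write_text("\n".join(f"file '{p}'" for p in segment_paths))
    cmd = [
        "ffmpeg", "-y",
        "-f", "concat", "-safe", "0", "-i", str(list_file),  # merged video segments
        "-i", str(audio_source),                              # original input for audio
        "-map", "0:v:0", "-map", "1:a:0",
        "-c:v", "copy", "-c:a", "copy",                       # no re-encoding
        "-shortest",
        str(output_path),
    ]
    subprocess.run(cmd, check=True)
```

This keeps peak memory flat because each chunk is encoded and dropped before the next one is processed.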
#### Memory Usage Calculation
```python
def estimate_memory_requirements(duration_seconds, fps, resolution_scale=0.5):
    """Calculate memory usage for different video lengths"""
    frames = duration_seconds * fps

    # Per-frame memory (rough estimates for VR180 at 50% scale)
    frame_size_mb = (3072 * 1536 * 3 * 4) / (1024 * 1024)  # ~18MB per frame
    total_memory_gb = (frames * frame_size_mb) / 1024

    return {
        'duration': duration_seconds,
        'total_frames': frames,
        'estimated_memory_gb': total_memory_gb,
        'safe_for_48gb': total_memory_gb < 40
    }

# Example outputs:
# 30 seconds: ~2.7GB (safe)
# 5 minutes: ~27GB (borderline)
# 1 hour: ~324GB (requires streaming)
```
## Success Criteria
### Technical Feasibility

View File

@@ -37,6 +37,8 @@ class OutputConfig:
format: str = "alpha" format: str = "alpha"
background_color: List[int] = None background_color: List[int] = None
maintain_sbs: bool = True maintain_sbs: bool = True
preserve_audio: bool = True
verify_sync: bool = True
def __post_init__(self): def __post_init__(self):
if self.background_color is None: if self.background_color is None:
@@ -99,7 +101,9 @@ class VR180Config:
'path': self.output.path,
'format': self.output.format,
'background_color': self.output.background_color,
-'maintain_sbs': self.output.maintain_sbs
'maintain_sbs': self.output.maintain_sbs,
'preserve_audio': self.output.preserve_audio,
'verify_sync': self.output.verify_sync
},
'hardware': {
'device': self.hardware.device,

View File

@@ -5,6 +5,8 @@ import cv2
from pathlib import Path
import warnings
import os
import tempfile
import shutil
try:
from sam2.build_sam import build_sam2_video_predictor
@@ -33,6 +35,7 @@ class SAM2VideoMatting:
self.predictor = None
self.inference_state = None
self.video_segments = {}
self.temp_video_path = None
self._load_model(model_cfg, checkpoint_path)
@@ -57,34 +60,58 @@ class SAM2VideoMatting:
if sam2_repo_path.exists():
checkpoint_path = str(sam2_repo_path)
# Use SAM2's build_sam2_video_predictor which returns the predictor directly
# The predictor IS the model - no .model attribute needed
self.predictor = build_sam2_video_predictor(
-model_cfg,
config_file=model_cfg,
-checkpoint_path,
ckpt_path=checkpoint_path,
device=self.device
)
-# Enable memory optimizations
-if self.memory_offload:
-self.predictor.fill_hole_area = 8
-if self.fp16 and self.device == "cuda":
-self.predictor.model.half()
except Exception as e:
raise RuntimeError(f"Failed to load SAM2 model: {e}")
-def init_video_state(self, video_frames: List[np.ndarray]) -> None:
def init_video_state(self, video_frames: List[np.ndarray] = None, video_path: str = None) -> None:
"""Initialize video inference state"""
if self.predictor is None:
raise RuntimeError("SAM2 model not loaded")
-# Create temporary directory for frames if needed
if video_path is not None:
# Use video path directly (SAM2's preferred method)
self.inference_state = self.predictor.init_state(
-video_path=None,
-video_frames=video_frames,
video_path=video_path,
offload_video_to_cpu=self.memory_offload,
async_loading_frames=True
)
else:
# For frame arrays, we need to save them as a temporary video first
if video_frames is None or len(video_frames) == 0:
raise ValueError("Either video_path or video_frames must be provided")
# Create temporary video file in current directory
import uuid
temp_video_name = f"temp_sam2_{uuid.uuid4().hex[:8]}.mp4"
temp_video_path = Path.cwd() / temp_video_name
# Write frames to temporary video
height, width = video_frames[0].shape[:2]
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter(str(temp_video_path), fourcc, 30.0, (width, height))
for frame in video_frames:
writer.write(frame)
writer.release()
# Initialize with temporary video
self.inference_state = self.predictor.init_state(
video_path=str(temp_video_path),
offload_video_to_cpu=self.memory_offload,
async_loading_frames=True
)
# Store temp path for cleanup
self.temp_video_path = temp_video_path
def add_person_prompts(self,
frame_idx: int,
@@ -236,6 +263,16 @@ class SAM2VideoMatting:
self.inference_state = None
# Clean up temporary video file
if self.temp_video_path is not None:
try:
if self.temp_video_path.exists():
# Remove the temporary video file
self.temp_video_path.unlink()
self.temp_video_path = None
except Exception as e:
warnings.warn(f"Failed to cleanup temp video: {e}")
# Clear CUDA cache
if torch.cuda.is_available():
torch.cuda.empty_cache()

View File

@@ -7,6 +7,12 @@ import tempfile
import shutil
from tqdm import tqdm
import warnings
import time
import subprocess
import gc
import psutil
import os
import sys
from .config import VR180Config
from .detector import YOLODetector
@@ -35,8 +41,117 @@ class VideoProcessor:
self.frame_width = 0
self.frame_height = 0
# Processing statistics
self.processing_stats = {
'start_time': None,
'end_time': None,
'total_duration': 0,
'processing_fps': 0,
'chunks_processed': 0,
'frames_processed': 0
}
self._initialize_models()
def _get_process_memory_info(self) -> Dict[str, float]:
"""Get detailed memory usage for current process and children"""
current_process = psutil.Process(os.getpid())
# Get memory info for current process
memory_info = current_process.memory_info()
current_rss = memory_info.rss / 1024**3 # Convert to GB
current_vms = memory_info.vms / 1024**3 # Virtual memory
# Get memory info for all children
children_rss = 0
children_vms = 0
child_count = 0
try:
for child in current_process.children(recursive=True):
try:
child_memory = child.memory_info()
children_rss += child_memory.rss / 1024**3
children_vms += child_memory.vms / 1024**3
child_count += 1
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
except psutil.NoSuchProcess:
pass
# System memory info
system_memory = psutil.virtual_memory()
system_total = system_memory.total / 1024**3
system_available = system_memory.available / 1024**3
system_used = system_memory.used / 1024**3
system_percent = system_memory.percent
return {
'process_rss_gb': current_rss,
'process_vms_gb': current_vms,
'children_rss_gb': children_rss,
'children_vms_gb': children_vms,
'total_process_gb': current_rss + children_rss,
'child_count': child_count,
'system_total_gb': system_total,
'system_used_gb': system_used,
'system_available_gb': system_available,
'system_percent': system_percent
}
def _print_memory_step(self, step_name: str):
"""Print memory usage for a specific processing step"""
memory_info = self._get_process_memory_info()
print(f"\n📊 MEMORY: {step_name}")
print(f" Process RSS: {memory_info['process_rss_gb']:.2f} GB")
if memory_info['children_rss_gb'] > 0:
print(f" Children RSS: {memory_info['children_rss_gb']:.2f} GB ({memory_info['child_count']} processes)")
print(f" Total Process: {memory_info['total_process_gb']:.2f} GB")
print(f" System: {memory_info['system_used_gb']:.1f}/{memory_info['system_total_gb']:.1f} GB ({memory_info['system_percent']:.1f}%)")
print(f" Available: {memory_info['system_available_gb']:.1f} GB")
def _aggressive_memory_cleanup(self, step_name: str = ""):
"""Perform aggressive memory cleanup and report before/after"""
if step_name:
print(f"\n🧹 CLEANUP: Before {step_name}")
before_info = self._get_process_memory_info()
before_rss = before_info['total_process_gb']
# Multiple rounds of garbage collection
for i in range(3):
gc.collect()
# Clear torch cache if available
try:
import torch
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
except ImportError:
pass
# Force Linux to release memory back to OS
if sys.platform == 'linux':
try:
import ctypes
libc = ctypes.CDLL("libc.so.6")
libc.malloc_trim(0)
except Exception as e:
print(f" Warning: Could not trim memory: {e}")
# Brief pause to allow cleanup
time.sleep(0.1)
after_info = self._get_process_memory_info()
after_rss = after_info['total_process_gb']
freed_memory = before_rss - after_rss
if step_name:
print(f" Before: {before_rss:.2f} GB → After: {after_rss:.2f} GB")
print(f" Freed: {freed_memory:.2f} GB")
def _initialize_models(self):
"""Initialize YOLO detector and SAM2 model"""
print("Initializing models...")
@@ -348,25 +463,109 @@ class VideoProcessor:
print(f"Saved {len(frames)} PNG frames to {output_dir}") print(f"Saved {len(frames)} PNG frames to {output_dir}")
def _save_mp4_video(self, frames: List[np.ndarray], output_path: str): def _save_mp4_video(self, frames: List[np.ndarray], output_path: str):
"""Save frames as MP4 video""" """Save frames as MP4 video with audio preservation"""
if not frames: if not frames:
return return
height, width = frames[0].shape[:2] output_path = Path(output_path)
temp_frames_dir = output_path.parent / f"temp_frames_{output_path.stem}"
temp_frames_dir.mkdir(exist_ok=True)
fourcc = cv2.VideoWriter_fourcc(*'mp4v') try:
writer = cv2.VideoWriter(output_path, fourcc, self.fps, (width, height)) # Save frames as images
print("Saving frames as images...")
for frame in tqdm(frames, desc="Writing video"): for i, frame in enumerate(tqdm(frames, desc="Saving frames")):
if frame.shape[2] == 4: # Convert RGBA to BGR if frame.shape[2] == 4: # Convert RGBA to BGR
frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2BGR) frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2BGR)
writer.write(frame)
writer.release() frame_path = temp_frames_dir / f"frame_{i:06d}.jpg"
cv2.imwrite(str(frame_path), frame, [cv2.IMWRITE_JPEG_QUALITY, 95])
# Create video with ffmpeg
self._create_video_with_ffmpeg(temp_frames_dir, output_path, len(frames))
finally:
# Cleanup temporary frames
if temp_frames_dir.exists():
shutil.rmtree(temp_frames_dir)
def _create_video_with_ffmpeg(self, frames_dir: Path, output_path: Path, frame_count: int):
"""Create video using ffmpeg with audio preservation"""
frame_pattern = str(frames_dir / "frame_%06d.jpg")
if self.config.output.preserve_audio:
# Create video with audio from input
cmd = [
'ffmpeg', '-y',
'-framerate', str(self.fps),
'-i', frame_pattern,
'-i', str(self.config.input.video_path), # Input video for audio
'-c:v', 'h264_nvenc', # Try GPU encoding first
'-preset', 'fast',
'-cq', '18',
'-c:a', 'copy', # Copy audio without re-encoding
'-map', '0:v:0', # Map video from frames
'-map', '1:a:0', # Map audio from input video
'-shortest', # Match shortest stream duration
'-pix_fmt', 'yuv420p',
str(output_path)
]
else:
# Create video without audio
cmd = [
'ffmpeg', '-y',
'-framerate', str(self.fps),
'-i', frame_pattern,
'-c:v', 'h264_nvenc',
'-preset', 'fast',
'-cq', '18',
'-pix_fmt', 'yuv420p',
str(output_path)
]
print(f"Creating video with ffmpeg...")
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
# Try CPU encoding as fallback
print("GPU encoding failed, trying CPU encoding...")
cmd[cmd.index('h264_nvenc')] = 'libx264'
cmd[cmd.index('-cq')] = '-crf' # Change quality parameter for CPU
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f"FFmpeg stdout: {result.stdout}")
print(f"FFmpeg stderr: {result.stderr}")
raise RuntimeError(f"FFmpeg failed with return code {result.returncode}")
# Verify frame count if sync verification is enabled
if self.config.output.verify_sync:
self._verify_frame_count(output_path, frame_count)
print(f"Saved video to {output_path}") print(f"Saved video to {output_path}")
def _verify_frame_count(self, video_path: Path, expected_frames: int):
"""Verify output video has correct frame count"""
try:
probe = ffmpeg.probe(str(video_path))
video_stream = next(
(stream for stream in probe['streams'] if stream['codec_type'] == 'video'),
None
)
if video_stream:
actual_frames = int(video_stream.get('nb_frames', 0))
if actual_frames != expected_frames:
print(f"⚠️ Frame count mismatch: expected {expected_frames}, got {actual_frames}")
else:
print(f"✅ Frame count verified: {actual_frames} frames")
except Exception as e:
print(f"⚠️ Could not verify frame count: {e}")
def process_video(self) -> None:
"""Main video processing pipeline"""
self.processing_stats['start_time'] = time.time()
print("Starting VR180 video processing...")
# Load video info
@@ -376,13 +575,15 @@ class VideoProcessor:
chunk_size, overlap_frames = self.calculate_optimal_chunking()
# Process video in chunks
-chunk_results = []
chunk_files = []  # Store file paths instead of frame data
temp_chunk_dir = Path(tempfile.mkdtemp(prefix="vr180_chunks_"))
try:
for start_frame in range(0, self.total_frames, chunk_size - overlap_frames):
end_frame = min(start_frame + chunk_size, self.total_frames)
frames_to_read = end_frame - start_frame
-chunk_idx = len(chunk_results)
chunk_idx = len(chunk_files)
print(f"\nProcessing chunk {chunk_idx}: frames {start_frame}-{end_frame}")
# Read chunk frames
@@ -395,23 +596,93 @@ class VideoProcessor:
# Process chunk
matted_frames = self.process_chunk(frames, chunk_idx)
-chunk_results.append(matted_frames)
-# Memory cleanup
# Save chunk to disk immediately to free memory
chunk_path = temp_chunk_dir / f"chunk_{chunk_idx:04d}.npz"
print(f"Saving chunk {chunk_idx} to disk...")
np.savez_compressed(str(chunk_path), frames=matted_frames)
chunk_files.append(chunk_path)
# Free the frames from memory immediately
del matted_frames
del frames
# Update statistics
self.processing_stats['chunks_processed'] += 1
self.processing_stats['frames_processed'] += frames_to_read
# Aggressive memory cleanup after each chunk
self._aggressive_memory_cleanup(f"chunk {chunk_idx} completion")
# Also use memory manager cleanup
self.memory_manager.cleanup_memory()
if self.memory_manager.should_emergency_cleanup():
self.memory_manager.emergency_cleanup()
-# Merge chunks if multiple
# Load and merge chunks from disk
-print("\nMerging chunks...")
print("\nLoading and merging chunks...")
chunk_results = []
for chunk_file in chunk_files:
print(f"Loading {chunk_file.name}...")
chunk_data = np.load(str(chunk_file))
chunk_results.append(chunk_data['frames'])
chunk_data.close() # Close the file
# Merge chunks
final_frames = self.merge_overlapping_chunks(chunk_results, overlap_frames)
# Free chunk results after merging
del chunk_results
self._aggressive_memory_cleanup("after merging chunks")
# Save results
print(f"Saving {len(final_frames)} processed frames...")
self.save_video(final_frames, self.config.output.path)
# Calculate final statistics
self.processing_stats['end_time'] = time.time()
self.processing_stats['total_duration'] = self.processing_stats['end_time'] - self.processing_stats['start_time']
if self.processing_stats['total_duration'] > 0:
self.processing_stats['processing_fps'] = self.processing_stats['frames_processed'] / self.processing_stats['total_duration']
# Print processing statistics
self._print_processing_statistics()
# Print final memory report
self.memory_manager.print_memory_report()
print("Video processing completed!")
finally:
# Clean up temporary chunk files
if temp_chunk_dir.exists():
print("Cleaning up temporary chunk files...")
shutil.rmtree(temp_chunk_dir)
def _print_processing_statistics(self):
"""Print detailed processing statistics"""
stats = self.processing_stats
video_duration = self.total_frames / self.fps if self.fps > 0 else 0
print("\n" + "="*60)
print("PROCESSING STATISTICS")
print("="*60)
print(f"Input video duration: {video_duration:.1f} seconds ({self.total_frames} frames @ {self.fps:.2f} fps)")
print(f"Total processing time: {stats['total_duration']:.1f} seconds")
print(f"Processing speed: {stats['processing_fps']:.2f} fps")
print(f"Speedup factor: {self.fps / stats['processing_fps']:.1f}x slower than realtime")
print(f"Chunks processed: {stats['chunks_processed']}")
print(f"Frames processed: {stats['frames_processed']}")
if video_duration > 0:
efficiency = video_duration / stats['total_duration']
print(f"Processing efficiency: {efficiency:.3f} (1.0 = realtime)")
# Estimate time for different video lengths
print(f"\nEstimated processing times:")
print(f" 5 minutes: {(5 * 60) / efficiency / 60:.1f} minutes")
print(f" 30 minutes: {(30 * 60) / efficiency / 60:.1f} minutes")
print(f" 1 hour: {(60 * 60) / efficiency / 60:.1f} minutes")
print("="*60 + "\n")

View File

@@ -65,11 +65,25 @@ class VR180Processor(VideoProcessor):
Returns:
Tuple of (left_eye_frame, right_eye_frame)
"""
-if self.sbs_split_point == 0:
-self.sbs_split_point = frame.shape[1] // 2
-left_eye = frame[:, :self.sbs_split_point]
-right_eye = frame[:, self.sbs_split_point:]
# Always calculate split point based on current frame width
# This handles scaled frames correctly
frame_width = frame.shape[1]
current_split_point = frame_width // 2
# Debug info on first use
if self.sbs_split_point == 0:
print(f"Frame dimensions: {frame.shape[1]}x{frame.shape[0]}")
print(f"Split point: {current_split_point}")
self.sbs_split_point = current_split_point # Store for reference
left_eye = frame[:, :current_split_point]
right_eye = frame[:, current_split_point:]
# Validate both eyes have content
if left_eye.size == 0:
raise RuntimeError(f"Left eye frame is empty after split (frame width: {frame_width})")
if right_eye.size == 0:
raise RuntimeError(f"Right eye frame is empty after split (frame width: {frame_width})")
return left_eye, right_eye
@@ -113,8 +127,23 @@ class VR180Processor(VideoProcessor):
left_eye_frames = []
right_eye_frames = []
-for frame in frames:
for i, frame in enumerate(frames):
left, right = self.split_sbs_frame(frame)
# Debug: Check if frames are valid
if i == 0: # Only debug first frame
print(f"Original frame shape: {frame.shape}")
print(f"Left eye shape: {left.shape}")
print(f"Right eye shape: {right.shape}")
print(f"Left eye min/max: {left.min()}/{left.max()}")
print(f"Right eye min/max: {right.min()}/{right.max()}")
# Validate frames
if left.size == 0:
raise RuntimeError(f"Left eye frame {i} is empty")
if right.size == 0:
raise RuntimeError(f"Right eye frame {i} is empty")
left_eye_frames.append(left)
right_eye_frames.append(right)
@@ -150,16 +179,148 @@ class VR180Processor(VideoProcessor):
if not eye_frames:
return []
-# Initialize SAM2 with eye frames
-self.sam2_model.init_video_state(eye_frames)
# Create a unique temporary video for this eye processing
import uuid
temp_video_name = f"temp_sam2_{eye_name}_chunk{chunk_idx}_{uuid.uuid4().hex[:8]}.mp4"
temp_video_path = Path.cwd() / temp_video_name
try:
# Use ffmpeg approach since OpenCV video writer is failing
height, width = eye_frames[0].shape[:2]
temp_video_path = temp_video_path.with_suffix('.mp4')
print(f"Creating temp video using ffmpeg: {temp_video_path}")
print(f"Video params: size=({width}, {height}), frames={len(eye_frames)}")
# Create a temporary directory for frame images
temp_frames_dir = temp_video_path.parent / f"frames_{temp_video_path.stem}"
temp_frames_dir.mkdir(exist_ok=True)
# Save frames as individual images (using JPEG for smaller file size)
print("Saving frames as images...")
for i, frame in enumerate(eye_frames):
# Check if frame is empty
if frame.size == 0:
raise RuntimeError(f"Frame {i} is empty (size=0)")
# Ensure frame is uint8
if frame.dtype != np.uint8:
frame = frame.astype(np.uint8)
# Debug first frame
if i == 0:
print(f"First frame to save: shape={frame.shape}, dtype={frame.dtype}, empty={frame.size == 0}")
# Use JPEG instead of PNG for smaller files (faster I/O, less disk space)
frame_path = temp_frames_dir / f"frame_{i:06d}.jpg"
# Use high quality JPEG to minimize compression artifacts
success = cv2.imwrite(str(frame_path), frame, [cv2.IMWRITE_JPEG_QUALITY, 95])
if not success:
print(f"Frame {i} details: shape={frame.shape}, dtype={frame.dtype}, size={frame.size}")
raise RuntimeError(f"Failed to save frame {i} as image")
if i % 50 == 0:
print(f"Saved {i}/{len(eye_frames)} frames")
# Force garbage collection every 100 frames to free memory
if i % 100 == 0:
import gc
gc.collect()
# Use ffmpeg to create video from images
import subprocess
# Use the original video's framerate - access through parent class
original_fps = self.fps if hasattr(self, 'fps') else 30.0
print(f"Using framerate: {original_fps} fps")
# Memory monitoring before ffmpeg
self._print_memory_step(f"Before ffmpeg encoding ({eye_name} eye)")
# Try GPU encoding first, fallback to CPU
gpu_cmd = [
'ffmpeg', '-y', # -y to overwrite output file
'-framerate', str(original_fps),
'-i', str(temp_frames_dir / 'frame_%06d.jpg'),
'-c:v', 'h264_nvenc', # NVIDIA GPU encoder
'-preset', 'fast', # GPU preset
'-cq', '18', # Quality for GPU encoding
'-pix_fmt', 'yuv420p',
str(temp_video_path)
]
cpu_cmd = [
'ffmpeg', '-y', # -y to overwrite output file
'-framerate', str(original_fps),
'-i', str(temp_frames_dir / 'frame_%06d.jpg'),
'-c:v', 'libx264', # CPU encoder
'-pix_fmt', 'yuv420p',
'-crf', '18', # Quality for CPU encoding
'-preset', 'medium',
str(temp_video_path)
]
# Try GPU first
print(f"Trying GPU encoding: {' '.join(gpu_cmd)}")
result = subprocess.run(gpu_cmd, capture_output=True, text=True)
if result.returncode != 0:
print("GPU encoding failed, trying CPU...")
print(f"GPU error: {result.stderr}")
ffmpeg_cmd = cpu_cmd
print(f"Using CPU encoding: {' '.join(ffmpeg_cmd)}")
result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
else:
print("GPU encoding successful!")
ffmpeg_cmd = gpu_cmd
print(f"Running ffmpeg: {' '.join(ffmpeg_cmd)}")
result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f"FFmpeg stdout: {result.stdout}")
print(f"FFmpeg stderr: {result.stderr}")
raise RuntimeError(f"FFmpeg failed with return code {result.returncode}")
# Clean up frame images
import shutil
shutil.rmtree(temp_frames_dir)
print(f"Created temp video successfully")
# Memory monitoring after ffmpeg
self._print_memory_step(f"After ffmpeg encoding ({eye_name} eye)")
# Verify the file was created and has content
if not temp_video_path.exists():
raise RuntimeError(f"Temporary video file was not created: {temp_video_path}")
file_size = temp_video_path.stat().st_size
if file_size == 0:
raise RuntimeError(f"Temporary video file is empty: {temp_video_path}")
print(f"Created temp video {temp_video_path} ({file_size / 1024 / 1024:.1f} MB)")
# Memory monitoring and cleanup before SAM2 initialization
num_frames = len(eye_frames) # Store count before freeing
first_frame = eye_frames[0].copy() # Copy first frame for detection before freeing
self._print_memory_step(f"Before SAM2 init ({eye_name} eye, {num_frames} frames)")
# CRITICAL: Explicitly free eye_frames from memory before SAM2 loads the same video
# This prevents the OOM issue where both Python frames and SAM2 frames exist simultaneously
del eye_frames # Free the frames array
self._aggressive_memory_cleanup(f"SAM2 init for {eye_name} eye")
# Initialize SAM2 with video path
self._print_memory_step(f"Starting SAM2 init ({eye_name} eye)")
self.sam2_model.init_video_state(video_path=str(temp_video_path))
self._print_memory_step(f"SAM2 initialized ({eye_name} eye)")
# Detect persons in first frame
-first_frame = eye_frames[0]
detections = self.detector.detect_persons(first_frame)
if not detections:
warnings.warn(f"No persons detected in {eye_name} eye, chunk {chunk_idx}")
-return self._create_empty_masks(eye_frames)
# Return empty masks for the number of frames
return self._create_empty_masks_from_count(num_frames, first_frame.shape)
print(f"Detected {len(detections)} persons in {eye_name} eye first frame")
@@ -169,15 +330,33 @@ class VR180Processor(VideoProcessor):
# Add prompts
object_ids = self.sam2_model.add_person_prompts(0, box_prompts, labels)
-# Propagate masks
# Propagate masks (most expensive operation)
self._print_memory_step(f"Before SAM2 propagation ({eye_name} eye, {num_frames} frames)")
video_segments = self.sam2_model.propagate_masks(
start_frame=0,
-max_frames=len(eye_frames)
max_frames=num_frames
)
self._print_memory_step(f"After SAM2 propagation ({eye_name} eye)")
# Apply masks - need to reload frames from temp video since we freed the original frames
self._print_memory_step(f"Before reloading frames for mask application ({eye_name} eye)")
# Read frames back from the temp video for mask application
cap = cv2.VideoCapture(str(temp_video_path))
reloaded_frames = []
for frame_idx in range(num_frames):
ret, frame = cap.read()
if not ret:
break
reloaded_frames.append(frame)
cap.release()
self._print_memory_step(f"Reloaded {len(reloaded_frames)} frames for mask application")
# Apply masks
matted_frames = []
-for frame_idx, frame in enumerate(eye_frames):
for frame_idx, frame in enumerate(reloaded_frames):
if frame_idx in video_segments:
frame_masks = video_segments[frame_idx]
combined_mask = self.sam2_model.get_combined_mask(frame_masks)
@@ -192,11 +371,23 @@ class VR180Processor(VideoProcessor):
matted_frames.append(matted_frame)
-# Cleanup
-self.sam2_model.cleanup()
# Free reloaded frames
del reloaded_frames
self._aggressive_memory_cleanup(f"After mask application ({eye_name} eye)")
return matted_frames
finally:
# Always cleanup
self.sam2_model.cleanup()
# Remove temporary video file
try:
if temp_video_path.exists():
temp_video_path.unlink()
except Exception as e:
warnings.warn(f"Failed to cleanup temp video {temp_video_path}: {e}")
def _process_eye_sequence_with_validation(self,
right_eye_frames: List[np.ndarray],
left_eye_results: List[np.ndarray],
@@ -259,6 +450,20 @@ class VR180Processor(VideoProcessor):
return validated_frames
def _create_empty_masks_from_count(self, num_frames: int, frame_shape: tuple) -> List[np.ndarray]:
"""Create empty masks when no persons detected (without frame array)"""
empty_frames = []
for _ in range(num_frames):
if self.config.output.format == "alpha":
# Transparent output
output = np.zeros((frame_shape[0], frame_shape[1], 4), dtype=np.uint8)
else:
# Green screen background
output = np.full((frame_shape[0], frame_shape[1], 3),
self.config.output.background_color, dtype=np.uint8)
empty_frames.append(output)
return empty_frames
def _get_mask_area(self, frame: np.ndarray) -> float:
"""Get mask area from processed frame"""
if frame.shape[2] == 4:  # Alpha channel