diff --git a/config_runpod.yaml b/config_runpod.yaml
index 905d6d6..90cd1bc 100644
--- a/config_runpod.yaml
+++ b/config_runpod.yaml
@@ -3,7 +3,7 @@ input:
 processing:
   scale_factor: 0.5  # A40 can handle 0.5 well
-  chunk_size: 200  # Smaller chunks to prevent OOM (was auto-calculated to 423)
+  chunk_size: 600  # Category A.4: Larger chunks for better VRAM utilization (was 200)
   overlap_frames: 30  # Reduced overlap
 
 detection:
@@ -19,9 +19,11 @@ matting:
 output:
   path: "/workspace/output/matted_video.mp4"
-  format: "alpha"
+  format: "greenscreen"  # Changed to greenscreen for easier testing
   background_color: [0, 255, 0]
   maintain_sbs: true
+  preserve_audio: true  # Category A.1: Audio preservation
+  verify_sync: true  # Category A.2: Frame count validation
 
 hardware:
   device: "cuda"
diff --git a/spec.md b/spec.md
index 630dbb7..c5a5b9a 100644
--- a/spec.md
+++ b/spec.md
@@ -123,6 +123,204 @@ hardware:
 3. **Performance Profiling**: Detailed resource usage analytics
 4. **Quality Validation**: Comprehensive testing suite
 
+## Post-Implementation Optimization Opportunities
+
+*Based on the first successful 30-second test clip execution (A40 GPU, 50% scale, 9 chunks of 200 frames)*
+
+### Performance Analysis Findings
+- **Processing Speed**: ~0.54s per frame (64.4s for a 120-frame chunk)
+- **VRAM Utilization**: Only 2.5% (1.11GB of 45GB available) - significantly underutilized
+- **RAM Usage**: 106GB used of 494GB available (21.5%)
+- **Primary Bottleneck**: Intermediate ffmpeg encoding operations per chunk
+
+### Identified Optimization Categories
+
+#### Category A: Performance Improvements (Quick Wins)
+1. **Audio Track Preservation** ⚠️ **CRITICAL**
+   - Issue: Output video is missing the input's audio track
+   - Solution: Use ffmpeg to copy the audio stream during final video creation
+   - Implementation: Add `-c:a copy` to the final ffmpeg command
+   - Impact: Essential for production usability
+   - Risk: Low, standard ffmpeg operation
+
+2. **Frame Count Synchronization** ⚠️ **CRITICAL**
+   - Issue: Audio drifts out of sync if input and output frame counts differ
+   - Solution: Validate exact frame count preservation throughout the pipeline
+   - Implementation: Frame count verification + duration matching
+   - Impact: Prevents audio desync in long videos
+   - Risk: Low, validation feature
+
+3. **Memory Usage Reality Check** ⚠️ **IMPORTANT**
+   - Current assumption: Unlimited RAM for a memory-only pipeline
+   - Reality: RunPod containers are limited to ~48GB RAM
+   - Risk calculation: A 1-hour video is ~213k frames; at ~18MB per frame (see the estimate below) that is far beyond any container RAM budget
+   - Solution: Implement streaming output instead of full in-memory accumulation
+   - Impact: Enables processing of long-form content
+   - Risk: Medium, requires pipeline restructuring
+
+4. **Larger Chunk Sizes**
+   - Current: 200 frames per chunk (conservative for a 10GB RTX 3080)
+   - Opportunity: 600-800 frames per chunk on high-VRAM systems
+   - Impact: Reduces 9 chunks to 2-3, with fewer intermediate operations
+   - Risk: Low, easily configurable
+
+5. **Streaming Output Pipeline**
+   - Current: Accumulate all processed frames in memory, write once
+   - Opportunity: Write processed chunks to temporary segments, merge at the end
+   - Impact: Constant memory usage regardless of video length
+   - Risk: Medium, requires temporary file management
+
+6. **Enhanced Performance Profiling**
+   - Current: Basic memory monitoring
+   - Opportunity: Detailed timing per processing stage (detection, propagation, encoding)
+   - Impact: Identifies exact bottlenecks for targeted optimization
+   - Risk: Low, debugging feature
+
+7. **Parallel Eye Processing**
+   - Current: Sequential left eye → right eye processing
+   - Opportunity: Process both eyes simultaneously (see the sketch below)
+   - Impact: Potential ~50% wall-clock reduction, better GPU utilization
+   - Risk: Medium, memory management complexity
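+
+A minimal sketch of the parallel eye idea from item 7, assuming the per-eye
+matting pipeline can be invoked as a plain callable (`matting_fn` and the
+frame lists below are illustrative placeholders, not current code). PyTorch
+releases the GIL while CUDA kernels run, so two Python threads can overlap
+left- and right-eye work:
+
+```python
+from concurrent.futures import ThreadPoolExecutor
+
+def process_eyes_parallel(left_frames, right_frames, matting_fn):
+    """Run the per-eye matting callable on both eyes concurrently."""
+    with ThreadPoolExecutor(max_workers=2) as pool:
+        left_future = pool.submit(matting_fn, left_frames)
+        right_future = pool.submit(matting_fn, right_frames)
+        # .result() re-raises any exception from the worker thread
+        return left_future.result(), right_future.result()
+```
+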
+#### Category B: Stereo Consistency Fixes (Critical for VR)
+1. **Master-Slave Eye Processing**
+   - Issue: Independent detection leads to mismatched person counts between eyes
+   - Solution: Use left-eye detections as "seeds" for right-eye processing
+   - Impact: Ensures identical person detection across the stereo pair
+   - Risk: Low, maintains current quality while improving consistency
+
+2. **Cross-Eye Detection Validation**
+   - Issue: Hair/clothing included in one eye's matte but not the other's
+   - Solution: Compare detection results, flag inconsistencies for reprocessing
+   - Impact: Expected to resolve 90%+ of stereo alignment mismatches
+   - Risk: Low, falls back to current behavior
+
+3. **Disparity-Aware Segmentation**
+   - Issue: Segmentation boundaries differ between eyes despite the same person
+   - Solution: Use stereo disparity to correlate features between eyes
+   - Impact: True stereo-consistent matting
+   - Risk: High, complex implementation
+
+4. **Joint Stereo Detection**
+   - Issue: YOLO runs independently on each eye
+   - Solution: Run YOLO on the full SBS frame, split detections spatially
+   - Impact: Guaranteed identical detection counts
+   - Risk: Medium, requires detection coordinate mapping
+
+#### Category C: Advanced Optimizations (Future)
+1. **Adaptive Memory Management**
+   - Opportunity: Dynamic chunk sizing based on real-time VRAM usage (see the sketch after this list)
+   - Impact: Optimal resource utilization across different hardware
+   - Risk: Medium, complex heuristics
+
+2. **Multi-Resolution Processing**
+   - Opportunity: Initial processing at lower resolution, edge refinement at full resolution
+   - Impact: Speed improvement while maintaining quality
+   - Risk: Medium, quality validation required
+
+3. **Enhanced Workflow Documentation**
+   - Issue: Unclear intermediate data lifecycle
+   - Solution: Detailed logging of chunk processing, optional intermediate preservation
+   - Impact: Better debugging and user understanding
+   - Risk: Low, documentation feature
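+
+A possible starting point for the dynamic chunk sizing in C.1: derive the
+chunk size from free VRAM at runtime instead of a hard-coded value. The
+12MB-per-frame working-set budget below is an assumed placeholder, to be
+replaced with profiled numbers:
+
+```python
+import torch
+
+def suggest_chunk_size(frame_budget_mb: float = 12.0,
+                       floor: int = 200, ceiling: int = 800) -> int:
+    """Pick a chunk size from currently free VRAM, keeping a 20% margin."""
+    if not torch.cuda.is_available():
+        return floor
+    free_bytes, _total_bytes = torch.cuda.mem_get_info()
+    usable_frames = int(free_bytes * 0.8 / (frame_budget_mb * 1024 * 1024))
+    return max(floor, min(ceiling, usable_frames))
+```
+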
+### Implementation Strategy
+- **Phase A**: Critical fixes and quick performance wins (audio preservation, larger chunks, profiling)
+- **Phase B**: Stereo consistency (master-slave processing, validation)
+- **Phase C**: Advanced features (disparity-aware matting, memory optimization)
+
+### Configuration Extensions Required
+```yaml
+processing:
+  chunk_size: 600          # Increase from 200 for high-VRAM systems
+  memory_pipeline: false   # Full in-memory pipeline (disabled due to container RAM limits)
+  streaming_output: true   # Write chunks progressively instead of accumulating
+  parallel_eyes: false     # Process eyes simultaneously
+  max_memory_gb: 40        # Realistic RAM limit for RunPod containers
+
+audio:
+  preserve_audio: true     # Copy audio track from input to output
+  verify_sync: true        # Validate frame count and duration matching
+  audio_codec: "copy"      # Preserve original audio codec
+
+stereo:
+  consistency_mode: "master_slave"  # "independent", "master_slave", "joint"
+  validation_threshold: 0.8         # Similarity threshold between eyes
+  correction_method: "transfer"     # "transfer", "reprocess", "ensemble"
+
+performance:
+  profile_enabled: true             # Detailed timing analysis
+  preserve_intermediates: false     # Keep per-chunk intermediates for debugging
+
+debugging:
+  log_intermediate_workflow: true       # Document chunk lifecycle
+  save_detection_visualization: false   # Debug detection mismatches
+  frame_count_validation: true          # Ensure exact frame preservation
+```
+
+### Technical Implementation Details
+
+#### Audio Preservation Implementation
+```python
+# During the final video save, include an audio stream copy.
+# fps, frame_pattern, input_video_path, and output_path are bound by the caller.
+ffmpeg_cmd = [
+    'ffmpeg', '-y',
+    '-framerate', str(fps),
+    '-i', frame_pattern,      # Video frames
+    '-i', input_video_path,   # Original video for audio
+    '-c:v', 'h264_nvenc',     # GPU video codec (with CPU fallback)
+    '-c:a', 'copy',           # Copy audio without re-encoding
+    '-map', '0:v:0',          # Map video from first input
+    '-map', '1:a:0',          # Map audio from second input
+    '-shortest',              # Match shortest stream duration
+    output_path
+]
+```
+
+#### Streaming Output Implementation
+```python
+# Instead of accumulating all frames in memory:
+class StreamingVideoWriter:
+    def __init__(self, output_path, fps, audio_source):
+        self.output_path = output_path
+        self.fps = fps
+        self.audio_source = audio_source
+        self.temp_segments = []
+        self.current_segment = 0
+
+    def write_chunk(self, processed_frames):
+        # Write the chunk to a temporary segment
+        segment_path = f"temp_segment_{self.current_segment:04d}.mp4"
+        self.write_video_segment(processed_frames, segment_path)
+        self.temp_segments.append(segment_path)
+        self.current_segment += 1
+
+    def finalize(self):
+        # Merge all segments and mux in the original audio
+        # (see the segment merge sketch at the end of this section)
+        self.merge_segments_with_audio()
+```
+
+#### Memory Usage Calculation
+```python
+def estimate_memory_requirements(duration_seconds, fps, resolution_scale=0.5):
+    """Calculate in-memory frame storage for different video lengths"""
+    frames = duration_seconds * fps
+
+    # Per-frame memory: VR180 SBS frame (6144x3072 source assumed), RGBA uint8
+    width = int(6144 * resolution_scale)
+    height = int(3072 * resolution_scale)
+    frame_size_mb = (width * height * 4) / (1024 * 1024)  # ~18MB at 50% scale
+
+    total_memory_gb = (frames * frame_size_mb) / 1024
+
+    return {
+        'duration': duration_seconds,
+        'total_frames': frames,
+        'estimated_memory_gb': total_memory_gb,
+        'safe_for_48gb': total_memory_gb < 40
+    }
+
+# Example outputs at 30 fps (~18MB per frame):
+# 30 seconds: 900 frames, ~16GB (safe)
+# 5 minutes: 9,000 frames, ~158GB (requires streaming)
+# 1 hour: 108,000 frames, ~1.9TB (requires streaming)
+```
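+
+#### Segment Merge Implementation
+One plausible shape for the `merge_segments_with_audio()` step left abstract
+in the streaming sketch above (file names and the `segments.txt` list are
+illustrative): concatenate the already-encoded segments with ffmpeg's concat
+demuxer, then mux the original audio back in without re-encoding.
+
+```python
+import subprocess
+from pathlib import Path
+
+def merge_segments_with_audio(segments, audio_source, output_path):
+    """Losslessly concatenate video segments, then mux audio from the input."""
+    concat_list = Path("segments.txt")
+    concat_list.write_text("".join(f"file '{s}'\n" for s in segments))
+    subprocess.run([
+        'ffmpeg', '-y',
+        '-f', 'concat', '-safe', '0', '-i', str(concat_list),  # video segments
+        '-i', str(audio_source),                                # original audio
+        '-c:v', 'copy',   # segments are already encoded, no re-encode
+        '-c:a', 'copy',
+        '-map', '0:v:0', '-map', '1:a:0',
+        '-shortest',
+        str(output_path),
+    ], check=True)
+```
+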
 ## Success Criteria
 
 ### Technical Feasibility
diff --git a/vr180_matting/config.py b/vr180_matting/config.py
index 6a01946..da8d843 100644
--- a/vr180_matting/config.py
+++ b/vr180_matting/config.py
@@ -37,6 +37,8 @@ class OutputConfig:
     format: str = "alpha"
     background_color: List[int] = None
     maintain_sbs: bool = True
+    preserve_audio: bool = True
+    verify_sync: bool = True
 
     def __post_init__(self):
         if self.background_color is None:
@@ -99,7 +101,9 @@ class VR180Config:
             'path': self.output.path,
             'format': self.output.format,
             'background_color': self.output.background_color,
-            'maintain_sbs': self.output.maintain_sbs
+            'maintain_sbs': self.output.maintain_sbs,
+            'preserve_audio': self.output.preserve_audio,
+            'verify_sync': self.output.verify_sync
         },
         'hardware': {
             'device': self.hardware.device,
diff --git a/vr180_matting/video_processor.py b/vr180_matting/video_processor.py
index 2c241f4..6d99c0d 100644
--- a/vr180_matting/video_processor.py
+++ b/vr180_matting/video_processor.py
@@ -7,6 +7,8 @@ import tempfile
 import shutil
 from tqdm import tqdm
 import warnings
+import time
+import subprocess
 
 from .config import VR180Config
 from .detector import YOLODetector
@@ -35,6 +37,16 @@ class VideoProcessor:
         self.frame_width = 0
         self.frame_height = 0
 
+        # Processing statistics
+        self.processing_stats = {
+            'start_time': None,
+            'end_time': None,
+            'total_duration': 0,
+            'processing_fps': 0,
+            'chunks_processed': 0,
+            'frames_processed': 0
+        }
+
         self._initialize_models()
 
     def _initialize_models(self):
@@ -348,25 +360,109 @@ class VideoProcessor:
         print(f"Saved {len(frames)} PNG frames to {output_dir}")
 
     def _save_mp4_video(self, frames: List[np.ndarray], output_path: str):
-        """Save frames as MP4 video"""
+        """Save frames as MP4 video with audio preservation"""
         if not frames:
             return
 
-        height, width = frames[0].shape[:2]
+        output_path = Path(output_path)
+        temp_frames_dir = output_path.parent / f"temp_frames_{output_path.stem}"
+        temp_frames_dir.mkdir(exist_ok=True)
 
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-        writer = cv2.VideoWriter(output_path, fourcc, self.fps, (width, height))
+        try:
+            # Save frames as images
+            print("Saving frames as images...")
+            for i, frame in enumerate(tqdm(frames, desc="Saving frames")):
+                if frame.shape[2] == 4:  # Convert RGBA to BGR
+                    frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2BGR)
+
+                frame_path = temp_frames_dir / f"frame_{i:06d}.jpg"
+                cv2.imwrite(str(frame_path), frame, [cv2.IMWRITE_JPEG_QUALITY, 95])
+
+            # Create the video with ffmpeg
+            self._create_video_with_ffmpeg(temp_frames_dir, output_path, len(frames))
+
+        finally:
+            # Clean up temporary frames
+            if temp_frames_dir.exists():
+                shutil.rmtree(temp_frames_dir)
+
+    def _create_video_with_ffmpeg(self, frames_dir: Path, output_path: Path, frame_count: int):
+        """Create video using ffmpeg with audio preservation"""
+        frame_pattern = str(frames_dir / "frame_%06d.jpg")
 
-        for frame in tqdm(frames, desc="Writing video"):
-            if frame.shape[2] == 4:  # Convert RGBA to BGR
-                frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2BGR)
-            writer.write(frame)
+        if self.config.output.preserve_audio:
+            # Create video with audio from the input
+            cmd = [
+                'ffmpeg', '-y',
+                '-framerate', str(self.fps),
+                '-i', frame_pattern,
+                '-i', str(self.config.input.video_path),  # Input video for audio
+                '-c:v', 'h264_nvenc',  # Try GPU encoding first
+                '-preset', 'fast',
+                '-cq', '18',
+                '-c:a', 'copy',    # Copy audio without re-encoding
+                '-map', '0:v:0',   # Map video from frames
+                '-map', '1:a:0',   # Map audio from the input video
+                '-shortest',       # Match shortest stream duration
+                '-pix_fmt', 'yuv420p',
+                str(output_path)
+            ]
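+            # h264_nvenc requires an NVIDIA GPU and an ffmpeg build compiled
+            # with NVENC support; when it is unavailable, the returncode check
+            # below retries the same command with libx264 on the CPU.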
+        else:
+            # Create video without audio
+            cmd = [
+                'ffmpeg', '-y',
+                '-framerate', str(self.fps),
+                '-i', frame_pattern,
+                '-c:v', 'h264_nvenc',
+                '-preset', 'fast',
+                '-cq', '18',
+                '-pix_fmt', 'yuv420p',
+                str(output_path)
+            ]
+
+        print("Creating video with ffmpeg...")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+
+        if result.returncode != 0:
+            # Try CPU encoding as a fallback
+            print("GPU encoding failed, trying CPU encoding...")
+            cmd[cmd.index('h264_nvenc')] = 'libx264'
+            cmd[cmd.index('-cq')] = '-crf'  # libx264 takes -crf rather than -cq
+
+            result = subprocess.run(cmd, capture_output=True, text=True)
+
+        if result.returncode != 0:
+            print(f"FFmpeg stdout: {result.stdout}")
+            print(f"FFmpeg stderr: {result.stderr}")
+            raise RuntimeError(f"FFmpeg failed with return code {result.returncode}")
+
+        # Verify frame count if sync verification is enabled
+        if self.config.output.verify_sync:
+            self._verify_frame_count(output_path, frame_count)
 
-        writer.release()
         print(f"Saved video to {output_path}")
 
+    def _verify_frame_count(self, video_path: Path, expected_frames: int):
+        """Verify that the output video has the expected frame count"""
+        try:
+            probe = ffmpeg.probe(str(video_path))
+            video_stream = next(
+                (stream for stream in probe['streams'] if stream['codec_type'] == 'video'),
+                None
+            )
+
+            if video_stream:
+                actual_frames = int(video_stream.get('nb_frames', 0))
+                if actual_frames != expected_frames:
+                    print(f"⚠️ Frame count mismatch: expected {expected_frames}, got {actual_frames}")
+                else:
+                    print(f"✅ Frame count verified: {actual_frames} frames")
+        except Exception as e:
+            print(f"⚠️ Could not verify frame count: {e}")
+
     def process_video(self) -> None:
         """Main video processing pipeline"""
+        self.processing_stats['start_time'] = time.time()
         print("Starting VR180 video processing...")
 
         # Load video info
@@ -397,6 +493,10 @@ class VideoProcessor:
             matted_frames = self.process_chunk(frames, chunk_idx)
             chunk_results.append(matted_frames)
 
+            # Update statistics
+            self.processing_stats['chunks_processed'] += 1
+            self.processing_stats['frames_processed'] += len(frames)
+
             # Memory cleanup
             self.memory_manager.cleanup_memory()
 
@@ -411,7 +511,43 @@ class VideoProcessor:
         print(f"Saving {len(final_frames)} processed frames...")
         self.save_video(final_frames, self.config.output.path)
 
+        # Calculate final statistics
+        self.processing_stats['end_time'] = time.time()
+        self.processing_stats['total_duration'] = self.processing_stats['end_time'] - self.processing_stats['start_time']
+        if self.processing_stats['total_duration'] > 0:
+            self.processing_stats['processing_fps'] = self.processing_stats['frames_processed'] / self.processing_stats['total_duration']
+
+        # Print processing statistics
+        self._print_processing_statistics()
+
         # Print final memory report
         self.memory_manager.print_memory_report()
 
-        print("Video processing completed!")
\ No newline at end of file
+        print("Video processing completed!")
+
+    def _print_processing_statistics(self):
+        """Print detailed processing statistics"""
+        stats = self.processing_stats
+        video_duration = self.total_frames / self.fps if self.fps > 0 else 0
+
+        print("\n" + "="*60)
+        print("PROCESSING STATISTICS")
+        print("="*60)
+        print(f"Input video duration: {video_duration:.1f} seconds ({self.total_frames} frames @ {self.fps:.2f} fps)")
+        print(f"Total processing time: {stats['total_duration']:.1f} seconds")
+        print(f"Processing speed: {stats['processing_fps']:.2f} fps")
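+        # fps / processing_fps is the wall-clock time needed per second of
+        # footage, e.g. 30 fps input processed at 2 fps runs 15x slower than realtime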
+        if stats['processing_fps'] > 0:
+            print(f"Slowdown factor: {self.fps / stats['processing_fps']:.1f}x slower than realtime")
+        print(f"Chunks processed: {stats['chunks_processed']}")
+        print(f"Frames processed: {stats['frames_processed']}")
+
+        if video_duration > 0:
+            efficiency = video_duration / stats['total_duration']
+            print(f"Processing efficiency: {efficiency:.3f} (1.0 = realtime)")
+
+            # Estimate time for different video lengths
+            print("\nEstimated processing times:")
+            print(f"  5 minutes: {(5 * 60) / efficiency / 60:.1f} minutes")
+            print(f"  30 minutes: {(30 * 60) / efficiency / 60:.1f} minutes")
+            print(f"  1 hour: {(60 * 60) / efficiency / 60:.1f} minutes")
+
+        print("="*60 + "\n")
\ No newline at end of file