Compare commits

e195d23584...main (15 commits)

| SHA1 |
|---|
| ccc68a3895 |
| 463f881eaf |
| b642b562f0 |
| 40ae537f7a |
| 28aa663b7b |
| 0244ba5204 |
| 141302cccf |
| 6b0eb6104d |
| 0f8818259e |
| 86274ba04a |
| 99c4da83af |
| c4af7baf3d |
| 3e21fd8678 |
| d933d6b606 |
| 7852303b40 |
```diff
@@ -3,8 +3,8 @@ input:
 
 processing:
   scale_factor: 0.5  # A40 can handle 0.5 well
-  chunk_size: 0  # Auto-calculate based on A40's 48GB VRAM
-  overlap_frames: 60
+  chunk_size: 600  # Category A.4: Larger chunks for better VRAM utilization (was 200)
+  overlap_frames: 30  # Reduced overlap
 
 detection:
   confidence_threshold: 0.7
```
```diff
@@ -19,9 +19,11 @@ matting:
 
 output:
   path: "/workspace/output/matted_video.mp4"
-  format: "alpha"
+  format: "greenscreen"  # Changed to greenscreen for easier testing
   background_color: [0, 255, 0]
   maintain_sbs: true
+  preserve_audio: true  # Category A.1: Audio preservation
+  verify_sync: true  # Category A.2: Frame count validation
 
 hardware:
   device: "cuda"
```
```diff
@@ -8,4 +8,5 @@ ultralytics>=8.0.0
 # sam2>=1.0.0  # Install via git: pip install git+https://github.com/facebookresearch/segment-anything-2.git
 tqdm>=4.65.0
 psutil>=5.9.0
 ffmpeg-python>=0.2.0
+decord>=0.6.0
```
```diff
@@ -14,6 +14,10 @@ echo "🐍 Installing Python dependencies..."
 pip install --upgrade pip
 pip install -r requirements.txt
 
+# Install decord for SAM2 video loading
+echo "📹 Installing decord for video processing..."
+pip install decord
+
 # Install SAM2 separately (not on PyPI)
 echo "🎯 Installing SAM2..."
 pip install git+https://github.com/facebookresearch/segment-anything-2.git
```
spec.md (+198 lines)
@@ -123,6 +123,204 @@ hardware:

3. **Performance Profiling**: Detailed resource usage analytics
4. **Quality Validation**: Comprehensive testing suite

## Post-Implementation Optimization Opportunities

*Based on the first successful 30-second test clip execution (A40 GPU, 50% scale, 9x200-frame chunks)*

### Performance Analysis Findings

- **Processing Speed**: ~0.54s per frame (64.4s for 120 frames per chunk)
- **VRAM Utilization**: Only 2.5% (1.11GB of 45GB available), significantly underutilized
- **RAM Usage**: 106GB used of 494GB available (21.5%)
- **Primary Bottleneck**: Intermediate ffmpeg encoding operations per chunk

### Identified Optimization Categories

#### Category A: Performance Improvements (Quick Wins)

1. **Audio Track Preservation** ⚠️ **CRITICAL**
   - Issue: Output video is missing the audio track from the input
   - Solution: Use ffmpeg to copy the audio stream during final video creation
   - Implementation: Add `-c:a copy` to the final ffmpeg command
   - Impact: Essential for production usability
   - Risk: Low, standard ffmpeg operation

2. **Frame Count Synchronization** ⚠️ **CRITICAL**
   - Issue: Audio sync drifts if input and output frame counts differ
   - Solution: Validate exact frame count preservation throughout the pipeline
   - Implementation: Frame count verification + duration matching
   - Impact: Prevents audio desync in long videos
   - Risk: Low, validation feature

3. **Memory Usage Reality Check** ⚠️ **IMPORTANT**
   - Current assumption: Unlimited RAM for a memory-only pipeline
   - Reality: RunPod container limited to ~48GB RAM
   - Risk calculation: a 1-hour video = ~213k frames = potential 20-40GB+ memory usage
   - Solution: Implement streaming output instead of full in-memory accumulation
   - Impact: Enables processing of long-form content
   - Risk: Medium, requires pipeline restructuring

4. **Larger Chunk Sizes**
   - Current: 200 frames per chunk (conservative for a 10GB RTX 3080)
   - Opportunity: 600-800 frames per chunk on high-VRAM systems
   - Impact: Reduce 9 chunks to 2-3 chunks, fewer intermediate operations
   - Risk: Low, easily configurable

5. **Streaming Output Pipeline**
   - Current: Accumulate all processed frames in memory, write once
   - Opportunity: Write processed chunks to temporary segments, merge at the end
   - Impact: Constant memory usage regardless of video length
   - Risk: Medium, requires temporary file management

6. **Enhanced Performance Profiling**
   - Current: Basic memory monitoring
   - Opportunity: Detailed timing per processing stage (detection, propagation, encoding); a minimal timing sketch follows this list
   - Impact: Identify exact bottlenecks for targeted optimization
   - Risk: Low, debugging feature

7. **Parallel Eye Processing**
   - Current: Sequential left eye → right eye processing
   - Opportunity: Process both eyes simultaneously
   - Impact: Potential 50% speedup, better GPU utilization
   - Risk: Medium, memory management complexity
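To make the profiling item (A.6) concrete, a minimal per-stage timing sketch could look like the following; `StageTimer` and the stage names are illustrative and not part of the current codebase.

```python
import time
from collections import defaultdict
from contextlib import contextmanager

class StageTimer:
    """Accumulates wall-clock time per named processing stage (illustrative sketch)."""

    def __init__(self):
        self.totals = defaultdict(float)
        self.counts = defaultdict(int)

    @contextmanager
    def stage(self, name: str):
        start = time.perf_counter()
        try:
            yield
        finally:
            self.totals[name] += time.perf_counter() - start
            self.counts[name] += 1

    def report(self):
        # Longest stages first, so the bottleneck is at the top of the report
        for name, total in sorted(self.totals.items(), key=lambda kv: -kv[1]):
            print(f"{name:<20} {total:8.2f}s over {self.counts[name]} calls")

# Hypothetical usage inside the chunk loop:
# timer = StageTimer()
# with timer.stage("detection"):
#     detections = detector.detect_persons(first_frame)
# with timer.stage("propagation"):
#     segments = sam2_model.propagate_masks(...)
# with timer.stage("encoding"):
#     save_chunk(...)
# timer.report()
```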
#### Category B: Stereo Consistency Fixes (Critical for VR)

1. **Master-Slave Eye Processing**
   - Issue: Independent detection leads to mismatched person counts between eyes
   - Solution: Use left eye detections as "seeds" for right eye processing
   - Impact: Ensures identical person detection across the stereo pair
   - Risk: Low, maintains current quality while improving consistency

2. **Cross-Eye Detection Validation**
   - Issue: Hair/clothing included on one eye but not the other
   - Solution: Compare detection results, flag inconsistencies for reprocessing
   - Impact: 90%+ stereo alignment improvement
   - Risk: Low, fallback to current behavior

3. **Disparity-Aware Segmentation**
   - Issue: Segmentation boundaries differ between eyes despite the same person
   - Solution: Use stereo disparity to correlate features between eyes
   - Impact: True stereo-consistent matting
   - Risk: High, complex implementation

4. **Joint Stereo Detection**
   - Issue: YOLO runs independently on each eye
   - Solution: Run YOLO on the full SBS frame, split detections spatially (see the sketch after this list)
   - Impact: Guaranteed identical detection counts
   - Risk: Medium, requires detection coordinate mapping
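A rough sketch of the coordinate mapping needed for Joint Stereo Detection (B.4), assuming detections arrive as pixel-space `[x1, y1, x2, y2]` boxes on the full SBS frame; the function name and the seam handling are assumptions, not existing code.

```python
from typing import List, Tuple

Box = Tuple[float, float, float, float]  # x1, y1, x2, y2 in full-SBS pixel coordinates

def split_sbs_detections(boxes: List[Box], frame_width: int) -> Tuple[List[Box], List[Box]]:
    """Assign full-SBS detections to left/right eye by box center and shift
    right-eye boxes into right-eye local coordinates (illustrative sketch)."""
    split_x = frame_width // 2
    left_boxes, right_boxes = [], []
    for x1, y1, x2, y2 in boxes:
        center_x = (x1 + x2) / 2
        if center_x < split_x:
            # Clamp to the left half in case the box straddles the seam
            left_boxes.append((x1, y1, min(x2, split_x), y2))
        else:
            # Translate into right-eye local coordinates
            right_boxes.append((max(x1 - split_x, 0), y1, x2 - split_x, y2))
    return left_boxes, right_boxes
```

Boxes that straddle the seam are simply clamped here; in practice they would likely be flagged for the cross-eye validation described in B.2.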
#### Category C: Advanced Optimizations (Future)

1. **Adaptive Memory Management**
   - Opportunity: Dynamic chunk sizing based on real-time VRAM usage (a rough sketch follows this list)
   - Impact: Optimal resource utilization across different hardware
   - Risk: Medium, complex heuristics

2. **Multi-Resolution Processing**
   - Opportunity: Initial processing at lower resolution, edge refinement at full resolution
   - Impact: Speed improvement while maintaining quality
   - Risk: Medium, quality validation required

3. **Enhanced Workflow Documentation**
   - Issue: Unclear intermediate data lifecycle
   - Solution: Detailed logging of chunk processing, optional intermediate preservation
   - Impact: Better debugging and user understanding
   - Risk: Low, documentation feature
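A rough sketch of the dynamic chunk sizing idea in C.1, assuming PyTorch is available and that per-frame VRAM cost is estimated from a probe run; the helper name, constants, and safety margin are illustrative.

```python
import torch

def adaptive_chunk_size(bytes_per_frame: int,
                        min_frames: int = 100,
                        max_frames: int = 800,
                        safety_fraction: float = 0.7) -> int:
    """Pick a chunk size from currently free VRAM (illustrative heuristic)."""
    if not torch.cuda.is_available():
        return min_frames
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    budget = int(free_bytes * safety_fraction)          # leave headroom for the model itself
    frames = budget // max(bytes_per_frame, 1)
    return max(min_frames, min(max_frames, frames))

# Hypothetical usage: measure bytes_per_frame from a small probe chunk, then
# chunk_size = adaptive_chunk_size(bytes_per_frame=20 * 1024 * 1024)
```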
### Implementation Strategy

- **Phase A**: Quick performance wins (larger chunks, profiling)
- **Phase B**: Stereo consistency (master-slave, validation)
- **Phase C**: Advanced features (disparity-aware, memory optimization)

### Configuration Extensions Required

```yaml
processing:
  chunk_size: 600          # Increase from 200 for high-VRAM systems
  memory_pipeline: false   # Skip intermediate video creation (disabled due to RAM limits)
  streaming_output: true   # Write chunks progressively instead of accumulating
  parallel_eyes: false     # Process eyes simultaneously
  max_memory_gb: 40        # Realistic RAM limit for RunPod containers

audio:
  preserve_audio: true     # Copy audio track from input to output
  verify_sync: true        # Validate frame count and duration matching
  audio_codec: "copy"      # Preserve original audio codec

stereo:
  consistency_mode: "master_slave"  # "independent", "master_slave", "joint"
  validation_threshold: 0.8         # Similarity threshold between eyes
  correction_method: "transfer"     # "transfer", "reprocess", "ensemble"

performance:
  profile_enabled: true             # Detailed timing analysis
  preserve_intermediates: false     # For debugging workflow

debugging:
  log_intermediate_workflow: true       # Document chunk lifecycle
  save_detection_visualization: false   # Debug detection mismatches
  frame_count_validation: true          # Ensure exact frame preservation
```
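If these sections are wired into the existing dataclass-based config (the `OutputConfig` diff later in this comparison adds fields the same way), the new groups might map onto something like the sketch below; the class names and defaults mirror the YAML keys above and are assumptions, not existing code.

```python
from dataclasses import dataclass

@dataclass
class StereoConfig:
    consistency_mode: str = "master_slave"   # "independent", "master_slave", "joint"
    validation_threshold: float = 0.8        # Similarity threshold between eyes
    correction_method: str = "transfer"      # "transfer", "reprocess", "ensemble"

@dataclass
class PerformanceConfig:
    profile_enabled: bool = True             # Detailed timing analysis
    preserve_intermediates: bool = False     # Keep chunk artifacts for debugging

@dataclass
class DebuggingConfig:
    log_intermediate_workflow: bool = True
    save_detection_visualization: bool = False
    frame_count_validation: bool = True
```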
### Technical Implementation Details

#### Audio Preservation Implementation

```python
# During final video save, include audio stream copy
ffmpeg_cmd = [
    'ffmpeg', '-y',
    '-framerate', str(fps),
    '-i', frame_pattern,       # Video frames
    '-i', input_video_path,    # Original video for audio
    '-c:v', 'h264_nvenc',      # GPU video codec (with CPU fallback)
    '-c:a', 'copy',            # Copy audio without re-encoding
    '-map', '0:v:0',           # Map video from first input
    '-map', '1:a:0',           # Map audio from second input
    '-shortest',               # Match shortest stream duration
    output_path
]
```

#### Streaming Output Implementation

```python
# Instead of accumulating frames in memory:
class StreamingVideoWriter:
    def __init__(self, output_path, fps, audio_source):
        self.temp_segments = []
        self.current_segment = 0

    def write_chunk(self, processed_frames):
        # Write chunk to temporary segment
        segment_path = f"temp_segment_{self.current_segment}.mp4"
        self.write_video_segment(processed_frames, segment_path)
        self.temp_segments.append(segment_path)
        self.current_segment += 1

    def finalize(self):
        # Merge all segments with audio preservation
        self.merge_segments_with_audio()
```
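The sketch above leaves `merge_segments_with_audio()` abstract. One plausible implementation is ffmpeg's concat demuxer for the video segments plus an audio stream copied from the source file, as in the following sketch; it is an assumption about how the merge could work, not the project's implementation.

```python
import subprocess
from pathlib import Path
from typing import List

def merge_segments_with_audio(segments: List[str], audio_source: str, output_path: str) -> None:
    """Concatenate MP4 segments losslessly and copy the original audio track (sketch)."""
    list_file = Path("segments.txt")
    # The concat demuxer expects lines of the form: file '<path>'
    list_file.write_text("".join(f"file '{Path(s).resolve()}'\n" for s in segments))
    cmd = [
        "ffmpeg", "-y",
        "-f", "concat", "-safe", "0", "-i", str(list_file),  # concatenated video segments
        "-i", audio_source,                                   # original video, for its audio
        "-map", "0:v:0", "-map", "1:a:0",
        "-c:v", "copy", "-c:a", "copy",                       # no re-encoding
        "-shortest",
        output_path,
    ]
    subprocess.run(cmd, check=True)
    list_file.unlink()
```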
#### Memory Usage Calculation

```python
def estimate_memory_requirements(duration_seconds, fps, resolution_scale=0.5):
    """Calculate memory usage for different video lengths"""
    frames = duration_seconds * fps

    # Per-frame memory (rough estimates for VR180 at 50% scale)
    frame_size_mb = (3072 * 1536 * 3 * 4) / (1024 * 1024)  # ~18MB per frame

    total_memory_gb = (frames * frame_size_mb) / 1024

    return {
        'duration': duration_seconds,
        'total_frames': frames,
        'estimated_memory_gb': total_memory_gb,
        'safe_for_48gb': total_memory_gb < 40
    }

# Example outputs:
# 30 seconds: ~2.7GB (safe)
# 5 minutes: ~27GB (borderline)
# 1 hour: ~324GB (requires streaming)
```

## Success Criteria

### Technical Feasibility
```diff
@@ -37,6 +37,8 @@ class OutputConfig:
     format: str = "alpha"
     background_color: List[int] = None
     maintain_sbs: bool = True
+    preserve_audio: bool = True
+    verify_sync: bool = True
 
     def __post_init__(self):
         if self.background_color is None:
```
```diff
@@ -99,7 +101,9 @@ class VR180Config:
                 'path': self.output.path,
                 'format': self.output.format,
                 'background_color': self.output.background_color,
-                'maintain_sbs': self.output.maintain_sbs
+                'maintain_sbs': self.output.maintain_sbs,
+                'preserve_audio': self.output.preserve_audio,
+                'verify_sync': self.output.verify_sync
             },
             'hardware': {
                 'device': self.hardware.device,
```
```diff
@@ -5,6 +5,8 @@ import cv2
 from pathlib import Path
 import warnings
 import os
+import tempfile
+import shutil
 
 try:
     from sam2.build_sam import build_sam2_video_predictor
```
```diff
@@ -33,6 +35,7 @@ class SAM2VideoMatting:
         self.predictor = None
         self.inference_state = None
         self.video_segments = {}
+        self.temp_video_path = None
 
         self._load_model(model_cfg, checkpoint_path)
 
```
```diff
@@ -57,36 +60,58 @@ class SAM2VideoMatting:
             if sam2_repo_path.exists():
                 checkpoint_path = str(sam2_repo_path)
 
-            # Use the config path as-is (should be relative to SAM2 package)
-            # Example: "configs/sam2.1/sam2.1_hiera_l.yaml"
+            # Use SAM2's build_sam2_video_predictor which returns the predictor directly
+            # The predictor IS the model - no .model attribute needed
             self.predictor = build_sam2_video_predictor(
-                model_cfg,
-                checkpoint_path,
+                config_file=model_cfg,
+                ckpt_path=checkpoint_path,
                 device=self.device
             )
 
             # Enable memory optimizations
             if self.memory_offload:
                 self.predictor.fill_hole_area = 8
 
             if self.fp16 and self.device == "cuda":
                 self.predictor.model.half()
 
         except Exception as e:
             raise RuntimeError(f"Failed to load SAM2 model: {e}")
 
-    def init_video_state(self, video_frames: List[np.ndarray]) -> None:
+    def init_video_state(self, video_frames: List[np.ndarray] = None, video_path: str = None) -> None:
         """Initialize video inference state"""
         if self.predictor is None:
             raise RuntimeError("SAM2 model not loaded")
 
-        # Create temporary directory for frames if needed
-        self.inference_state = self.predictor.init_state(
-            video_path=None,
-            video_frames=video_frames,
-            offload_video_to_cpu=self.memory_offload,
-            async_loading_frames=True
-        )
+        if video_path is not None:
+            # Use video path directly (SAM2's preferred method)
+            self.inference_state = self.predictor.init_state(
+                video_path=video_path,
+                offload_video_to_cpu=self.memory_offload,
+                async_loading_frames=True
+            )
+        else:
+            # For frame arrays, we need to save them as a temporary video first
+            if video_frames is None or len(video_frames) == 0:
+                raise ValueError("Either video_path or video_frames must be provided")
+
+            # Create temporary video file in current directory
+            import uuid
+            temp_video_name = f"temp_sam2_{uuid.uuid4().hex[:8]}.mp4"
+            temp_video_path = Path.cwd() / temp_video_name
+
+            # Write frames to temporary video
+            height, width = video_frames[0].shape[:2]
+            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+            writer = cv2.VideoWriter(str(temp_video_path), fourcc, 30.0, (width, height))
+
+            for frame in video_frames:
+                writer.write(frame)
+            writer.release()
+
+            # Initialize with temporary video
+            self.inference_state = self.predictor.init_state(
+                video_path=str(temp_video_path),
+                offload_video_to_cpu=self.memory_offload,
+                async_loading_frames=True
+            )
+
+            # Store temp path for cleanup
+            self.temp_video_path = temp_video_path
 
     def add_person_prompts(self,
                            frame_idx: int,
```
```diff
@@ -238,6 +263,16 @@ class SAM2VideoMatting:
 
         self.inference_state = None
 
+        # Clean up temporary video file
+        if self.temp_video_path is not None:
+            try:
+                if self.temp_video_path.exists():
+                    # Remove the temporary video file
+                    self.temp_video_path.unlink()
+                self.temp_video_path = None
+            except Exception as e:
+                warnings.warn(f"Failed to cleanup temp video: {e}")
+
         # Clear CUDA cache
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
```
```diff
@@ -7,6 +7,12 @@ import tempfile
 import shutil
 from tqdm import tqdm
 import warnings
+import time
+import subprocess
+import gc
+import psutil
+import os
+import sys
 
 from .config import VR180Config
 from .detector import YOLODetector
```
```diff
@@ -35,8 +41,117 @@ class VideoProcessor:
         self.frame_width = 0
         self.frame_height = 0
 
+        # Processing statistics
+        self.processing_stats = {
+            'start_time': None,
+            'end_time': None,
+            'total_duration': 0,
+            'processing_fps': 0,
+            'chunks_processed': 0,
+            'frames_processed': 0
+        }
+
         self._initialize_models()
 
+    def _get_process_memory_info(self) -> Dict[str, float]:
+        """Get detailed memory usage for current process and children"""
+        current_process = psutil.Process(os.getpid())
+
+        # Get memory info for current process
+        memory_info = current_process.memory_info()
+        current_rss = memory_info.rss / 1024**3  # Convert to GB
+        current_vms = memory_info.vms / 1024**3  # Virtual memory
+
+        # Get memory info for all children
+        children_rss = 0
+        children_vms = 0
+        child_count = 0
+
+        try:
+            for child in current_process.children(recursive=True):
+                try:
+                    child_memory = child.memory_info()
+                    children_rss += child_memory.rss / 1024**3
+                    children_vms += child_memory.vms / 1024**3
+                    child_count += 1
+                except (psutil.NoSuchProcess, psutil.AccessDenied):
+                    pass
+        except psutil.NoSuchProcess:
+            pass
+
+        # System memory info
+        system_memory = psutil.virtual_memory()
+        system_total = system_memory.total / 1024**3
+        system_available = system_memory.available / 1024**3
+        system_used = system_memory.used / 1024**3
+        system_percent = system_memory.percent
+
+        return {
+            'process_rss_gb': current_rss,
+            'process_vms_gb': current_vms,
+            'children_rss_gb': children_rss,
+            'children_vms_gb': children_vms,
+            'total_process_gb': current_rss + children_rss,
+            'child_count': child_count,
+            'system_total_gb': system_total,
+            'system_used_gb': system_used,
+            'system_available_gb': system_available,
+            'system_percent': system_percent
+        }
+
+    def _print_memory_step(self, step_name: str):
+        """Print memory usage for a specific processing step"""
+        memory_info = self._get_process_memory_info()
+
+        print(f"\n📊 MEMORY: {step_name}")
+        print(f"   Process RSS: {memory_info['process_rss_gb']:.2f} GB")
+        if memory_info['children_rss_gb'] > 0:
+            print(f"   Children RSS: {memory_info['children_rss_gb']:.2f} GB ({memory_info['child_count']} processes)")
+        print(f"   Total Process: {memory_info['total_process_gb']:.2f} GB")
+        print(f"   System: {memory_info['system_used_gb']:.1f}/{memory_info['system_total_gb']:.1f} GB ({memory_info['system_percent']:.1f}%)")
+        print(f"   Available: {memory_info['system_available_gb']:.1f} GB")
+
+    def _aggressive_memory_cleanup(self, step_name: str = ""):
+        """Perform aggressive memory cleanup and report before/after"""
+        if step_name:
+            print(f"\n🧹 CLEANUP: Before {step_name}")
+
+        before_info = self._get_process_memory_info()
+        before_rss = before_info['total_process_gb']
+
+        # Multiple rounds of garbage collection
+        for i in range(3):
+            gc.collect()
+
+        # Clear torch cache if available
+        try:
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+        except ImportError:
+            pass
+
+        # Force Linux to release memory back to OS
+        if sys.platform == 'linux':
+            try:
+                import ctypes
+                libc = ctypes.CDLL("libc.so.6")
+                libc.malloc_trim(0)
+            except Exception as e:
+                print(f"   Warning: Could not trim memory: {e}")
+
+        # Brief pause to allow cleanup
+        time.sleep(0.1)
+
+        after_info = self._get_process_memory_info()
+        after_rss = after_info['total_process_gb']
+        freed_memory = before_rss - after_rss
+
+        if step_name:
+            print(f"   Before: {before_rss:.2f} GB → After: {after_rss:.2f} GB")
+            print(f"   Freed: {freed_memory:.2f} GB")
+
     def _initialize_models(self):
         """Initialize YOLO detector and SAM2 model"""
         print("Initializing models...")
```
```diff
@@ -348,25 +463,109 @@ class VideoProcessor:
         print(f"Saved {len(frames)} PNG frames to {output_dir}")
 
     def _save_mp4_video(self, frames: List[np.ndarray], output_path: str):
-        """Save frames as MP4 video"""
+        """Save frames as MP4 video with audio preservation"""
         if not frames:
             return
 
         height, width = frames[0].shape[:2]
+        output_path = Path(output_path)
+        temp_frames_dir = output_path.parent / f"temp_frames_{output_path.stem}"
+        temp_frames_dir.mkdir(exist_ok=True)
 
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-        writer = cv2.VideoWriter(output_path, fourcc, self.fps, (width, height))
+        try:
+            # Save frames as images
+            print("Saving frames as images...")
+            for i, frame in enumerate(tqdm(frames, desc="Saving frames")):
+                if frame.shape[2] == 4:  # Convert RGBA to BGR
+                    frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2BGR)
+
+                frame_path = temp_frames_dir / f"frame_{i:06d}.jpg"
+                cv2.imwrite(str(frame_path), frame, [cv2.IMWRITE_JPEG_QUALITY, 95])
+
+            # Create video with ffmpeg
+            self._create_video_with_ffmpeg(temp_frames_dir, output_path, len(frames))
+
+        finally:
+            # Cleanup temporary frames
+            if temp_frames_dir.exists():
+                shutil.rmtree(temp_frames_dir)
+
+    def _create_video_with_ffmpeg(self, frames_dir: Path, output_path: Path, frame_count: int):
+        """Create video using ffmpeg with audio preservation"""
+        frame_pattern = str(frames_dir / "frame_%06d.jpg")
 
-        for frame in tqdm(frames, desc="Writing video"):
-            if frame.shape[2] == 4:  # Convert RGBA to BGR
-                frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2BGR)
-            writer.write(frame)
+        if self.config.output.preserve_audio:
+            # Create video with audio from input
+            cmd = [
+                'ffmpeg', '-y',
+                '-framerate', str(self.fps),
+                '-i', frame_pattern,
+                '-i', str(self.config.input.video_path),  # Input video for audio
+                '-c:v', 'h264_nvenc',  # Try GPU encoding first
+                '-preset', 'fast',
+                '-cq', '18',
+                '-c:a', 'copy',  # Copy audio without re-encoding
+                '-map', '0:v:0',  # Map video from frames
+                '-map', '1:a:0',  # Map audio from input video
+                '-shortest',  # Match shortest stream duration
+                '-pix_fmt', 'yuv420p',
+                str(output_path)
+            ]
+        else:
+            # Create video without audio
+            cmd = [
+                'ffmpeg', '-y',
+                '-framerate', str(self.fps),
+                '-i', frame_pattern,
+                '-c:v', 'h264_nvenc',
+                '-preset', 'fast',
+                '-cq', '18',
+                '-pix_fmt', 'yuv420p',
+                str(output_path)
+            ]
+
+        print(f"Creating video with ffmpeg...")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+
+        if result.returncode != 0:
+            # Try CPU encoding as fallback
+            print("GPU encoding failed, trying CPU encoding...")
+            cmd[cmd.index('h264_nvenc')] = 'libx264'
+            cmd[cmd.index('-cq')] = '-crf'  # Change quality parameter for CPU
+
+            result = subprocess.run(cmd, capture_output=True, text=True)
+
+            if result.returncode != 0:
+                print(f"FFmpeg stdout: {result.stdout}")
+                print(f"FFmpeg stderr: {result.stderr}")
+                raise RuntimeError(f"FFmpeg failed with return code {result.returncode}")
+
+        # Verify frame count if sync verification is enabled
+        if self.config.output.verify_sync:
+            self._verify_frame_count(output_path, frame_count)
 
-        writer.release()
-        print(f"Saved video to {output_path}")
+    def _verify_frame_count(self, video_path: Path, expected_frames: int):
+        """Verify output video has correct frame count"""
+        try:
+            probe = ffmpeg.probe(str(video_path))
+            video_stream = next(
+                (stream for stream in probe['streams'] if stream['codec_type'] == 'video'),
+                None
+            )
+
+            if video_stream:
+                actual_frames = int(video_stream.get('nb_frames', 0))
+                if actual_frames != expected_frames:
+                    print(f"⚠️ Frame count mismatch: expected {expected_frames}, got {actual_frames}")
+                else:
+                    print(f"✅ Frame count verified: {actual_frames} frames")
+        except Exception as e:
+            print(f"⚠️ Could not verify frame count: {e}")
 
     def process_video(self) -> None:
         """Main video processing pipeline"""
+        self.processing_stats['start_time'] = time.time()
         print("Starting VR180 video processing...")
 
         # Load video info
```
```diff
@@ -376,42 +575,114 @@ class VideoProcessor:
         chunk_size, overlap_frames = self.calculate_optimal_chunking()
 
         # Process video in chunks
-        chunk_results = []
+        chunk_files = []  # Store file paths instead of frame data
+        temp_chunk_dir = Path(tempfile.mkdtemp(prefix="vr180_chunks_"))
 
-        for start_frame in range(0, self.total_frames, chunk_size - overlap_frames):
-            end_frame = min(start_frame + chunk_size, self.total_frames)
-            frames_to_read = end_frame - start_frame
+        try:
+            for start_frame in range(0, self.total_frames, chunk_size - overlap_frames):
+                end_frame = min(start_frame + chunk_size, self.total_frames)
+                frames_to_read = end_frame - start_frame
+
+                chunk_idx = len(chunk_files)
+                print(f"\nProcessing chunk {chunk_idx}: frames {start_frame}-{end_frame}")
+
+                # Read chunk frames
+                frames = self.read_video_frames(
+                    self.config.input.video_path,
+                    start_frame=start_frame,
+                    num_frames=frames_to_read,
+                    scale_factor=self.config.processing.scale_factor
+                )
+
+                # Process chunk
+                matted_frames = self.process_chunk(frames, chunk_idx)
+
+                # Save chunk to disk immediately to free memory
+                chunk_path = temp_chunk_dir / f"chunk_{chunk_idx:04d}.npz"
+                print(f"Saving chunk {chunk_idx} to disk...")
+                np.savez_compressed(str(chunk_path), frames=matted_frames)
+                chunk_files.append(chunk_path)
+
+                # Free the frames from memory immediately
+                del matted_frames
+                del frames
+
+                # Update statistics
+                self.processing_stats['chunks_processed'] += 1
+                self.processing_stats['frames_processed'] += frames_to_read
+
+                # Aggressive memory cleanup after each chunk
+                self._aggressive_memory_cleanup(f"chunk {chunk_idx} completion")
+
+                # Also use memory manager cleanup
+                self.memory_manager.cleanup_memory()
+
+                if self.memory_manager.should_emergency_cleanup():
+                    self.memory_manager.emergency_cleanup()
 
-            chunk_idx = len(chunk_results)
-            print(f"\nProcessing chunk {chunk_idx}: frames {start_frame}-{end_frame}")
+            # Load and merge chunks from disk
+            print("\nLoading and merging chunks...")
+            chunk_results = []
+            for chunk_file in chunk_files:
+                print(f"Loading {chunk_file.name}...")
+                chunk_data = np.load(str(chunk_file))
+                chunk_results.append(chunk_data['frames'])
+                chunk_data.close()  # Close the file
 
-            # Read chunk frames
-            frames = self.read_video_frames(
-                self.config.input.video_path,
-                start_frame=start_frame,
-                num_frames=frames_to_read,
-                scale_factor=self.config.processing.scale_factor
-            )
+            # Merge chunks
+            final_frames = self.merge_overlapping_chunks(chunk_results, overlap_frames)
 
-            # Process chunk
-            matted_frames = self.process_chunk(frames, chunk_idx)
-            chunk_results.append(matted_frames)
+            # Free chunk results after merging
+            del chunk_results
+            self._aggressive_memory_cleanup("after merging chunks")
 
-            # Memory cleanup
-            self.memory_manager.cleanup_memory()
+            # Save results
+            print(f"Saving {len(final_frames)} processed frames...")
+            self.save_video(final_frames, self.config.output.path)
 
-            if self.memory_manager.should_emergency_cleanup():
-                self.memory_manager.emergency_cleanup()
+            # Calculate final statistics
+            self.processing_stats['end_time'] = time.time()
+            self.processing_stats['total_duration'] = self.processing_stats['end_time'] - self.processing_stats['start_time']
+            if self.processing_stats['total_duration'] > 0:
+                self.processing_stats['processing_fps'] = self.processing_stats['frames_processed'] / self.processing_stats['total_duration']
+
+            # Print processing statistics
+            self._print_processing_statistics()
+
+            # Print final memory report
+            self.memory_manager.print_memory_report()
+
+            print("Video processing completed!")
+
+        finally:
+            # Clean up temporary chunk files
+            if temp_chunk_dir.exists():
+                print("Cleaning up temporary chunk files...")
+                shutil.rmtree(temp_chunk_dir)
+
+    def _print_processing_statistics(self):
+        """Print detailed processing statistics"""
+        stats = self.processing_stats
+        video_duration = self.total_frames / self.fps if self.fps > 0 else 0
 
-        # Merge chunks if multiple
-        print("\nMerging chunks...")
-        final_frames = self.merge_overlapping_chunks(chunk_results, overlap_frames)
+        print("\n" + "="*60)
+        print("PROCESSING STATISTICS")
+        print("="*60)
+        print(f"Input video duration: {video_duration:.1f} seconds ({self.total_frames} frames @ {self.fps:.2f} fps)")
+        print(f"Total processing time: {stats['total_duration']:.1f} seconds")
+        print(f"Processing speed: {stats['processing_fps']:.2f} fps")
+        print(f"Speedup factor: {self.fps / stats['processing_fps']:.1f}x slower than realtime")
+        print(f"Chunks processed: {stats['chunks_processed']}")
+        print(f"Frames processed: {stats['frames_processed']}")
 
-        # Save results
-        print(f"Saving {len(final_frames)} processed frames...")
-        self.save_video(final_frames, self.config.output.path)
+        if video_duration > 0:
+            efficiency = video_duration / stats['total_duration']
+            print(f"Processing efficiency: {efficiency:.3f} (1.0 = realtime)")
+
+            # Estimate time for different video lengths
+            print(f"\nEstimated processing times:")
+            print(f"  5 minutes: {(5 * 60) / efficiency / 60:.1f} minutes")
+            print(f"  30 minutes: {(30 * 60) / efficiency / 60:.1f} minutes")
+            print(f"  1 hour: {(60 * 60) / efficiency / 60:.1f} minutes")
 
-        # Print final memory report
-        self.memory_manager.print_memory_report()
-
-        print("Video processing completed!")
+        print("="*60 + "\n")
```
```diff
@@ -65,11 +65,25 @@ class VR180Processor(VideoProcessor):
         Returns:
             Tuple of (left_eye_frame, right_eye_frame)
         """
-        if self.sbs_split_point == 0:
-            self.sbs_split_point = frame.shape[1] // 2
+        # Always calculate split point based on current frame width
+        # This handles scaled frames correctly
+        frame_width = frame.shape[1]
+        current_split_point = frame_width // 2
 
-        left_eye = frame[:, :self.sbs_split_point]
-        right_eye = frame[:, self.sbs_split_point:]
+        # Debug info on first use
+        if self.sbs_split_point == 0:
+            print(f"Frame dimensions: {frame.shape[1]}x{frame.shape[0]}")
+            print(f"Split point: {current_split_point}")
+            self.sbs_split_point = current_split_point  # Store for reference
+
+        left_eye = frame[:, :current_split_point]
+        right_eye = frame[:, current_split_point:]
+
+        # Validate both eyes have content
+        if left_eye.size == 0:
+            raise RuntimeError(f"Left eye frame is empty after split (frame width: {frame_width})")
+        if right_eye.size == 0:
+            raise RuntimeError(f"Right eye frame is empty after split (frame width: {frame_width})")
 
         return left_eye, right_eye
 
```
```diff
@@ -113,8 +127,23 @@ class VR180Processor(VideoProcessor):
         left_eye_frames = []
         right_eye_frames = []
 
-        for frame in frames:
+        for i, frame in enumerate(frames):
             left, right = self.split_sbs_frame(frame)
 
+            # Debug: Check if frames are valid
+            if i == 0:  # Only debug first frame
+                print(f"Original frame shape: {frame.shape}")
+                print(f"Left eye shape: {left.shape}")
+                print(f"Right eye shape: {right.shape}")
+                print(f"Left eye min/max: {left.min()}/{left.max()}")
+                print(f"Right eye min/max: {right.min()}/{right.max()}")
+
+            # Validate frames
+            if left.size == 0:
+                raise RuntimeError(f"Left eye frame {i} is empty")
+            if right.size == 0:
+                raise RuntimeError(f"Right eye frame {i} is empty")
+
             left_eye_frames.append(left)
             right_eye_frames.append(right)
 
```
```diff
@@ -150,52 +179,214 @@ class VR180Processor(VideoProcessor):
         if not eye_frames:
             return []
 
-        # Initialize SAM2 with eye frames
-        self.sam2_model.init_video_state(eye_frames)
+        # Create a unique temporary video for this eye processing
+        import uuid
+        temp_video_name = f"temp_sam2_{eye_name}_chunk{chunk_idx}_{uuid.uuid4().hex[:8]}.mp4"
+        temp_video_path = Path.cwd() / temp_video_name
 
-        # Detect persons in first frame
-        first_frame = eye_frames[0]
-        detections = self.detector.detect_persons(first_frame)
-
-        if not detections:
-            warnings.warn(f"No persons detected in {eye_name} eye, chunk {chunk_idx}")
-            return self._create_empty_masks(eye_frames)
-
-        print(f"Detected {len(detections)} persons in {eye_name} eye first frame")
-
-        # Convert to SAM2 prompts
-        box_prompts, labels = self.detector.convert_to_sam_prompts(detections)
-
-        # Add prompts
-        object_ids = self.sam2_model.add_person_prompts(0, box_prompts, labels)
-
-        # Propagate masks
-        video_segments = self.sam2_model.propagate_masks(
-            start_frame=0,
-            max_frames=len(eye_frames)
-        )
-
-        # Apply masks
-        matted_frames = []
-        for frame_idx, frame in enumerate(eye_frames):
-            if frame_idx in video_segments:
-                frame_masks = video_segments[frame_idx]
-                combined_mask = self.sam2_model.get_combined_mask(frame_masks)
-
-                matted_frame = self.sam2_model.apply_mask_to_frame(
-                    frame, combined_mask,
-                    output_format=self.config.output.format,
-                    background_color=self.config.output.background_color
-                )
-            else:
-                matted_frame = self._create_empty_mask_frame(frame)
+        try:
+            # Use ffmpeg approach since OpenCV video writer is failing
+            height, width = eye_frames[0].shape[:2]
+            temp_video_path = temp_video_path.with_suffix('.mp4')
 
-            matted_frames.append(matted_frame)
-
-        # Cleanup
-        self.sam2_model.cleanup()
-
-        return matted_frames
+            print(f"Creating temp video using ffmpeg: {temp_video_path}")
+            print(f"Video params: size=({width}, {height}), frames={len(eye_frames)}")
+
+            # Create a temporary directory for frame images
+            temp_frames_dir = temp_video_path.parent / f"frames_{temp_video_path.stem}"
+            temp_frames_dir.mkdir(exist_ok=True)
+
+            # Save frames as individual images (using JPEG for smaller file size)
+            print("Saving frames as images...")
+            for i, frame in enumerate(eye_frames):
+                # Check if frame is empty
+                if frame.size == 0:
+                    raise RuntimeError(f"Frame {i} is empty (size=0)")
+
+                # Ensure frame is uint8
+                if frame.dtype != np.uint8:
+                    frame = frame.astype(np.uint8)
+
+                # Debug first frame
+                if i == 0:
+                    print(f"First frame to save: shape={frame.shape}, dtype={frame.dtype}, empty={frame.size == 0}")
+
+                # Use JPEG instead of PNG for smaller files (faster I/O, less disk space)
+                frame_path = temp_frames_dir / f"frame_{i:06d}.jpg"
+                # Use high quality JPEG to minimize compression artifacts
+                success = cv2.imwrite(str(frame_path), frame, [cv2.IMWRITE_JPEG_QUALITY, 95])
+                if not success:
+                    print(f"Frame {i} details: shape={frame.shape}, dtype={frame.dtype}, size={frame.size}")
+                    raise RuntimeError(f"Failed to save frame {i} as image")
+
+                if i % 50 == 0:
+                    print(f"Saved {i}/{len(eye_frames)} frames")
+
+                # Force garbage collection every 100 frames to free memory
+                if i % 100 == 0:
+                    import gc
+                    gc.collect()
+
+            # Use ffmpeg to create video from images
+            import subprocess
+            # Use the original video's framerate - access through parent class
+            original_fps = self.fps if hasattr(self, 'fps') else 30.0
+            print(f"Using framerate: {original_fps} fps")
+
+            # Memory monitoring before ffmpeg
+            self._print_memory_step(f"Before ffmpeg encoding ({eye_name} eye)")
+
+            # Try GPU encoding first, fallback to CPU
+            gpu_cmd = [
+                'ffmpeg', '-y',  # -y to overwrite output file
+                '-framerate', str(original_fps),
+                '-i', str(temp_frames_dir / 'frame_%06d.jpg'),
+                '-c:v', 'h264_nvenc',  # NVIDIA GPU encoder
+                '-preset', 'fast',  # GPU preset
+                '-cq', '18',  # Quality for GPU encoding
+                '-pix_fmt', 'yuv420p',
+                str(temp_video_path)
+            ]
+
+            cpu_cmd = [
+                'ffmpeg', '-y',  # -y to overwrite output file
+                '-framerate', str(original_fps),
+                '-i', str(temp_frames_dir / 'frame_%06d.jpg'),
+                '-c:v', 'libx264',  # CPU encoder
+                '-pix_fmt', 'yuv420p',
+                '-crf', '18',  # Quality for CPU encoding
+                '-preset', 'medium',
+                str(temp_video_path)
+            ]
+
+            # Try GPU first
+            print(f"Trying GPU encoding: {' '.join(gpu_cmd)}")
+            result = subprocess.run(gpu_cmd, capture_output=True, text=True)
+
+            if result.returncode != 0:
+                print("GPU encoding failed, trying CPU...")
+                print(f"GPU error: {result.stderr}")
+                ffmpeg_cmd = cpu_cmd
+                print(f"Using CPU encoding: {' '.join(ffmpeg_cmd)}")
+                result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
+            else:
+                print("GPU encoding successful!")
+                ffmpeg_cmd = gpu_cmd
+
+            print(f"Running ffmpeg: {' '.join(ffmpeg_cmd)}")
+            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
+
+            if result.returncode != 0:
+                print(f"FFmpeg stdout: {result.stdout}")
+                print(f"FFmpeg stderr: {result.stderr}")
+                raise RuntimeError(f"FFmpeg failed with return code {result.returncode}")
+
+            # Clean up frame images
+            import shutil
+            shutil.rmtree(temp_frames_dir)
+
+            print(f"Created temp video successfully")
+
+            # Memory monitoring after ffmpeg
+            self._print_memory_step(f"After ffmpeg encoding ({eye_name} eye)")
+
+            # Verify the file was created and has content
+            if not temp_video_path.exists():
+                raise RuntimeError(f"Temporary video file was not created: {temp_video_path}")
+
+            file_size = temp_video_path.stat().st_size
+            if file_size == 0:
+                raise RuntimeError(f"Temporary video file is empty: {temp_video_path}")
+
+            print(f"Created temp video {temp_video_path} ({file_size / 1024 / 1024:.1f} MB)")
+
+            # Memory monitoring and cleanup before SAM2 initialization
+            num_frames = len(eye_frames)  # Store count before freeing
+            first_frame = eye_frames[0].copy()  # Copy first frame for detection before freeing
+            self._print_memory_step(f"Before SAM2 init ({eye_name} eye, {num_frames} frames)")
+
+            # CRITICAL: Explicitly free eye_frames from memory before SAM2 loads the same video
+            # This prevents the OOM issue where both Python frames and SAM2 frames exist simultaneously
+            del eye_frames  # Free the frames array
+            self._aggressive_memory_cleanup(f"SAM2 init for {eye_name} eye")
+
+            # Initialize SAM2 with video path
+            self._print_memory_step(f"Starting SAM2 init ({eye_name} eye)")
+            self.sam2_model.init_video_state(video_path=str(temp_video_path))
+            self._print_memory_step(f"SAM2 initialized ({eye_name} eye)")
+
+            # Detect persons in first frame
+            detections = self.detector.detect_persons(first_frame)
+
+            if not detections:
+                warnings.warn(f"No persons detected in {eye_name} eye, chunk {chunk_idx}")
+                # Return empty masks for the number of frames
+                return self._create_empty_masks_from_count(num_frames, first_frame.shape)
+
+            print(f"Detected {len(detections)} persons in {eye_name} eye first frame")
+
+            # Convert to SAM2 prompts
+            box_prompts, labels = self.detector.convert_to_sam_prompts(detections)
+
+            # Add prompts
+            object_ids = self.sam2_model.add_person_prompts(0, box_prompts, labels)
+
+            # Propagate masks (most expensive operation)
+            self._print_memory_step(f"Before SAM2 propagation ({eye_name} eye, {num_frames} frames)")
+            video_segments = self.sam2_model.propagate_masks(
+                start_frame=0,
+                max_frames=num_frames
+            )
+            self._print_memory_step(f"After SAM2 propagation ({eye_name} eye)")
+
+            # Apply masks - need to reload frames from temp video since we freed the original frames
+            self._print_memory_step(f"Before reloading frames for mask application ({eye_name} eye)")
+
+            # Read frames back from the temp video for mask application
+            cap = cv2.VideoCapture(str(temp_video_path))
+            reloaded_frames = []
+
+            for frame_idx in range(num_frames):
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                reloaded_frames.append(frame)
+            cap.release()
+
+            self._print_memory_step(f"Reloaded {len(reloaded_frames)} frames for mask application")
+
+            # Apply masks
+            matted_frames = []
+            for frame_idx, frame in enumerate(reloaded_frames):
+                if frame_idx in video_segments:
+                    frame_masks = video_segments[frame_idx]
+                    combined_mask = self.sam2_model.get_combined_mask(frame_masks)
+
+                    matted_frame = self.sam2_model.apply_mask_to_frame(
+                        frame, combined_mask,
+                        output_format=self.config.output.format,
+                        background_color=self.config.output.background_color
+                    )
+                else:
+                    matted_frame = self._create_empty_mask_frame(frame)
+
+                matted_frames.append(matted_frame)
+
+            # Free reloaded frames
+            del reloaded_frames
+            self._aggressive_memory_cleanup(f"After mask application ({eye_name} eye)")
+
+            return matted_frames
+
+        finally:
+            # Always cleanup
+            self.sam2_model.cleanup()
+
+            # Remove temporary video file
+            try:
+                if temp_video_path.exists():
+                    temp_video_path.unlink()
+            except Exception as e:
+                warnings.warn(f"Failed to cleanup temp video {temp_video_path}: {e}")
 
     def _process_eye_sequence_with_validation(self,
                                               right_eye_frames: List[np.ndarray],
```
```diff
@@ -259,6 +450,20 @@ class VR180Processor(VideoProcessor):
 
         return validated_frames
 
+    def _create_empty_masks_from_count(self, num_frames: int, frame_shape: tuple) -> List[np.ndarray]:
+        """Create empty masks when no persons detected (without frame array)"""
+        empty_frames = []
+        for _ in range(num_frames):
+            if self.config.output.format == "alpha":
+                # Transparent output
+                output = np.zeros((frame_shape[0], frame_shape[1], 4), dtype=np.uint8)
+            else:
+                # Green screen background
+                output = np.full((frame_shape[0], frame_shape[1], 3),
+                                 self.config.output.background_color, dtype=np.uint8)
+            empty_frames.append(output)
+        return empty_frames
+
     def _get_mask_area(self, frame: np.ndarray) -> float:
         """Get mask area from processed frame"""
         if frame.shape[2] == 4:  # Alpha channel
```