# samyolo_on_segments/core/mask_processor.py
"""
Mask processor module for applying green screen effects.
Handles applying masks to video frames to create green screen output.
"""
import os
import cv2
import numpy as np
import subprocess
import sys
import logging
from typing import Dict, List, Any, Optional, Tuple
from collections import deque

# CuPy is optional: importing it unconditionally would make the whole module fail
# on CPU-only machines, so fall back to None and let _check_gpu_availability()
# report that GPU acceleration is unavailable.
try:
    import cupy as cp
except ImportError:
    cp = None

logger = logging.getLogger(__name__)
class MaskProcessor:
"""Handles mask application and green screen processing with quality enhancements."""
    def __init__(self, green_color: Optional[List[int]] = None, blue_color: Optional[List[int]] = None,
                 mask_quality_config: Optional[Dict[str, Any]] = None,
                 output_mode: str = "green_screen"):
        """
        Initialize mask processor with quality enhancement options.
        Args:
            green_color: BGR color for the green screen background (defaults to [0, 255, 0])
            blue_color: BGR color for the second object, if needed (defaults to [255, 0, 0])
            mask_quality_config: Configuration dictionary for mask quality improvements
            output_mode: Output mode - "green_screen" or "alpha_channel"
        """
        # Avoid mutable default arguments; frames are BGR, so colors are stored as BGR.
        self.green_color = green_color if green_color is not None else [0, 255, 0]
        self.blue_color = blue_color if blue_color is not None else [255, 0, 0]
self.output_mode = output_mode
self.use_gpu = self._check_gpu_availability()
# Mask quality configuration with defaults
if mask_quality_config is None:
mask_quality_config = {}
self.enable_edge_blur = mask_quality_config.get('enable_edge_blur', False)
self.edge_blur_radius = mask_quality_config.get('edge_blur_radius', 3)
self.edge_blur_sigma = mask_quality_config.get('edge_blur_sigma', 1.5)
self.enable_temporal_smoothing = mask_quality_config.get('enable_temporal_smoothing', False)
self.temporal_blend_weight = mask_quality_config.get('temporal_blend_weight', 0.3)
self.temporal_history_frames = mask_quality_config.get('temporal_history_frames', 3)
self.enable_morphological_cleaning = mask_quality_config.get('enable_morphological_cleaning', False)
self.morphology_kernel_size = mask_quality_config.get('morphology_kernel_size', 5)
self.min_component_size = mask_quality_config.get('min_component_size', 500)
self.alpha_blending_mode = mask_quality_config.get('alpha_blending_mode', 'gaussian')
self.alpha_transition_width = mask_quality_config.get('alpha_transition_width', 10)
self.enable_bilateral_filter = mask_quality_config.get('enable_bilateral_filter', False)
self.bilateral_d = mask_quality_config.get('bilateral_d', 9)
self.bilateral_sigma_color = mask_quality_config.get('bilateral_sigma_color', 75)
self.bilateral_sigma_space = mask_quality_config.get('bilateral_sigma_space', 75)
# Temporal history buffer for mask smoothing
self.mask_history = deque(maxlen=self.temporal_history_frames)
# Log configuration
if any([self.enable_edge_blur, self.enable_temporal_smoothing, self.enable_morphological_cleaning]):
logger.info("Mask quality enhancements enabled:")
if self.enable_edge_blur:
logger.info(f" Edge blur: radius={self.edge_blur_radius}, sigma={self.edge_blur_sigma}")
if self.enable_temporal_smoothing:
logger.info(f" Temporal smoothing: weight={self.temporal_blend_weight}, history={self.temporal_history_frames}")
if self.enable_morphological_cleaning:
logger.info(f" Morphological cleaning: kernel={self.morphology_kernel_size}, min_size={self.min_component_size}")
logger.info(f" Alpha blending: mode={self.alpha_blending_mode}, width={self.alpha_transition_width}")
else:
logger.info("Mask quality enhancements disabled - using standard binary masking")
logger.info(f"Output mode: {self.output_mode}")
    def _check_gpu_availability(self) -> bool:
        """Check if CuPy GPU acceleration is available."""
        if cp is None:
            logger.warning("CuPy is not installed, using CPU")
            return False
        try:
            # Run a trivial computation to confirm a usable GPU is actually present
            test_array = cp.array([1, 2, 3])
            _ = test_array * 2
            logger.info("GPU acceleration available via CuPy")
            return True
        except Exception as e:
            logger.warning(f"GPU acceleration not available, using CPU: {e}")
            return False
def enhance_mask_quality(self, mask: np.ndarray) -> np.ndarray:
"""
Apply all enabled mask quality enhancements.
Args:
mask: Input binary mask
Returns:
Enhanced mask with quality improvements applied
"""
enhanced_mask = mask.copy()
# 1. Morphological cleaning
if self.enable_morphological_cleaning:
enhanced_mask = self._clean_mask_morphologically(enhanced_mask)
# 2. Temporal smoothing
if self.enable_temporal_smoothing:
enhanced_mask = self._apply_temporal_smoothing(enhanced_mask)
# 3. Edge enhancement and blurring
if self.enable_edge_blur:
enhanced_mask = self._apply_edge_blur(enhanced_mask)
# 4. Bilateral filtering (if enabled)
if self.enable_bilateral_filter:
enhanced_mask = self._apply_bilateral_filter(enhanced_mask)
return enhanced_mask
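
    # Illustrative usage sketch (config keys as read in __init__; values arbitrary):
    #
    #   processor = MaskProcessor(mask_quality_config={
    #       'enable_morphological_cleaning': True,
    #       'enable_temporal_smoothing': True,
    #       'enable_edge_blur': True,
    #   })
    #   soft_mask = processor.enhance_mask_quality(binary_mask)  # float mask in [0, 1]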
def _clean_mask_morphologically(self, mask: np.ndarray) -> np.ndarray:
"""
Clean mask using morphological operations to remove noise and small artifacts.
Args:
mask: Input binary mask
Returns:
Cleaned mask
"""
# Convert to uint8 for OpenCV operations
mask_uint8 = (mask * 255).astype(np.uint8)
# Create morphological kernel
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,
(self.morphology_kernel_size, self.morphology_kernel_size))
# Opening operation (erosion followed by dilation) to remove small noise
cleaned = cv2.morphologyEx(mask_uint8, cv2.MORPH_OPEN, kernel)
# Closing operation (dilation followed by erosion) to fill small holes
cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_CLOSE, kernel)
# Remove small connected components
if self.min_component_size > 0:
cleaned = self._remove_small_components(cleaned)
return (cleaned / 255.0).astype(np.float32)
def _remove_small_components(self, mask: np.ndarray) -> np.ndarray:
"""
Remove connected components smaller than minimum size.
Args:
mask: Input binary mask (uint8)
Returns:
Mask with small components removed
"""
        # Find connected components (centroids are returned but not needed here)
        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
# Create output mask
output_mask = np.zeros_like(mask)
# Keep components larger than minimum size (skip background label 0)
for i in range(1, num_labels):
component_size = stats[i, cv2.CC_STAT_AREA]
if component_size >= self.min_component_size:
output_mask[labels == i] = 255
return output_mask
def _apply_temporal_smoothing(self, mask: np.ndarray) -> np.ndarray:
"""
Apply temporal smoothing using mask history.
Args:
mask: Current frame mask
Returns:
Temporally smoothed mask
"""
if len(self.mask_history) == 0:
# First frame, no history to blend with
self.mask_history.append(mask.copy())
return mask
# Blend with previous frames using weighted average
smoothed_mask = mask.astype(np.float32)
total_weight = 1.0
for i, hist_mask in enumerate(reversed(self.mask_history)):
# Exponential decay: more recent frames have higher weight
frame_weight = self.temporal_blend_weight * (0.8 ** i)
smoothed_mask += hist_mask.astype(np.float32) * frame_weight
total_weight += frame_weight
# Normalize by total weight
smoothed_mask /= total_weight
# Update history
self.mask_history.append(mask.copy())
return smoothed_mask
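
    # Worked example: with temporal_blend_weight = 0.3 and three frames of history,
    # the weights are 1.0 (current) plus 0.3, 0.24 and 0.192 (0.3 * 0.8**i), so the
    # sum 1.732 normalizes the blend and the current frame contributes about 58%.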
def _apply_edge_blur(self, mask: np.ndarray) -> np.ndarray:
"""
Apply Gaussian blur to mask edges for smooth transitions.
Args:
mask: Input mask
Returns:
Mask with blurred edges
"""
# Apply Gaussian blur
kernel_size = 2 * self.edge_blur_radius + 1
blurred_mask = cv2.GaussianBlur(mask.astype(np.float32),
(kernel_size, kernel_size),
self.edge_blur_sigma)
return blurred_mask
def _apply_bilateral_filter(self, mask: np.ndarray) -> np.ndarray:
"""
Apply bilateral filtering for edge-preserving smoothing.
Args:
mask: Input mask
Returns:
Filtered mask
"""
# Convert to uint8 for bilateral filter
mask_uint8 = (mask * 255).astype(np.uint8)
# Apply bilateral filter
filtered = cv2.bilateralFilter(mask_uint8, self.bilateral_d,
self.bilateral_sigma_color,
self.bilateral_sigma_space)
return (filtered / 255.0).astype(np.float32)
def _create_alpha_mask(self, mask: np.ndarray) -> np.ndarray:
"""
Create alpha mask with smooth transitions based on blending mode.
Args:
mask: Input binary/float mask
Returns:
Alpha mask with smooth transitions
"""
if self.alpha_blending_mode == "linear":
return mask
elif self.alpha_blending_mode == "gaussian":
# Use distance transform for smooth falloff
binary_mask = (mask > 0.5).astype(np.uint8)
# Distance transform from mask edges
dist_inside = cv2.distanceTransform(binary_mask, cv2.DIST_L2, 5)
dist_outside = cv2.distanceTransform(1 - binary_mask, cv2.DIST_L2, 5)
# Create smooth alpha based on distance
alpha = np.zeros_like(mask, dtype=np.float32)
transition_width = self.alpha_transition_width
# Inside mask: fade from edge
alpha[binary_mask > 0] = np.minimum(1.0, dist_inside[binary_mask > 0] / transition_width)
# Outside mask: fade to zero
alpha[binary_mask == 0] = np.maximum(0.0, 1.0 - dist_outside[binary_mask == 0] / transition_width)
return alpha
elif self.alpha_blending_mode == "sigmoid":
# Sigmoid-based smooth transition
return 1.0 / (1.0 + np.exp(-10 * (mask - 0.5)))
else:
return mask
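
    # Worked example ("gaussian" mode, alpha_transition_width = 10): despite the
    # name, the falloff is a linear distance ramp. A pixel 4 px inside the mask
    # edge gets alpha 0.4, anything 10+ px inside saturates at 1.0, and a pixel
    # 3 px outside gets 1.0 - 3/10 = 0.7, for a 20 px transition band in total.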
def apply_green_mask(self, frame: np.ndarray, masks: List[np.ndarray]) -> np.ndarray:
"""
Apply green screen mask to a frame with quality enhancements.
Args:
frame: Input video frame (BGR format)
masks: List of object masks to apply
Returns:
Frame with green screen background and enhanced mask quality
"""
# Combine all masks into a single mask
combined_mask = self._combine_masks(masks)
# Apply quality enhancements
enhanced_mask = self.enhance_mask_quality(combined_mask)
# Create alpha mask for smooth blending
alpha_mask = self._create_alpha_mask(enhanced_mask)
# Apply mask using alpha blending
if self.use_gpu:
return self._apply_green_mask_gpu_enhanced(frame, alpha_mask)
else:
return self._apply_green_mask_cpu_enhanced(frame, alpha_mask)
def apply_mask_with_alpha(self, frame: np.ndarray, masks: List[np.ndarray]) -> np.ndarray:
"""
Apply mask to create RGBA frame with alpha channel.
Args:
frame: Input video frame (BGR format)
masks: List of object masks to apply
Returns:
RGBA frame with alpha channel
"""
# Combine all masks into a single mask
combined_mask = self._combine_masks(masks)
# Apply quality enhancements
enhanced_mask = self.enhance_mask_quality(combined_mask)
# Create alpha mask for smooth blending
alpha_mask = self._create_alpha_mask(enhanced_mask)
# Resize alpha mask to match frame if needed
if alpha_mask.shape != frame.shape[:2]:
alpha_mask = cv2.resize(alpha_mask, (frame.shape[1], frame.shape[0]))
# Convert BGR to BGRA
bgra_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
# Set alpha channel
bgra_frame[:, :, 3] = (alpha_mask * 255).astype(np.uint8)
return bgra_frame
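
    # Note: the BGRA frame carries straight (non-premultiplied) alpha; the color
    # channels are left untouched and only the alpha plane encodes the mask.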
def _combine_masks(self, masks: List[np.ndarray]) -> np.ndarray:
"""
Combine multiple object masks into a single mask.
Args:
masks: List of object masks
Returns:
Combined mask
"""
if not masks:
return np.zeros((0, 0), dtype=np.float32)
# Start with first mask
combined_mask = masks[0].squeeze().astype(np.float32)
# Combine with remaining masks using logical OR
for mask in masks[1:]:
mask_squeezed = mask.squeeze().astype(np.float32)
if mask_squeezed.shape != combined_mask.shape:
# Resize mask to match combined mask
mask_squeezed = cv2.resize(mask_squeezed,
(combined_mask.shape[1], combined_mask.shape[0]),
interpolation=cv2.INTER_NEAREST)
combined_mask = np.maximum(combined_mask, mask_squeezed)
return combined_mask
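
    # Example (illustrative): np.maximum acts as a soft logical OR, so for float
    # masks a = [0.0, 0.7] and b = [0.4, 0.2] the combined row is [0.4, 0.7]; for
    # strictly binary masks this reduces to the usual set union.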
def reset_temporal_history(self):
"""Reset temporal history buffer. Call this when starting a new segment."""
self.mask_history.clear()
logger.debug("Temporal history buffer reset")
def _apply_green_mask_gpu_enhanced(self, frame: np.ndarray, alpha_mask: np.ndarray) -> np.ndarray:
"""GPU-accelerated green mask application with alpha blending using CuPy (Phase 1 optimized)."""
try:
# Convert to CuPy arrays with optimized data transfer
frame_gpu = cp.asarray(frame, dtype=cp.uint8)
alpha_gpu = cp.asarray(alpha_mask, dtype=cp.float32)
            # Resize alpha mask to match frame if needed. OpenCV cannot operate on
            # CuPy arrays, so this branch round-trips through host memory; it only
            # runs when the mask resolution differs from the frame.
            if alpha_gpu.shape != frame_gpu.shape[:2]:
                alpha_gpu = cp.asarray(cv2.resize(cp.asnumpy(alpha_gpu),
                                                  (frame_gpu.shape[1], frame_gpu.shape[0])))
# Create green background (optimized broadcasting)
green_color_gpu = cp.array(self.green_color, dtype=cp.uint8)
green_background = cp.broadcast_to(green_color_gpu, frame_gpu.shape)
            # Expand alpha to HxWx1 so it broadcasts across the color channels
            alpha_3d = cp.expand_dims(alpha_gpu, axis=2)
            # Blend in float32 to avoid uint8 overflow, then clip back to [0, 255]
frame_float = frame_gpu.astype(cp.float32)
green_float = green_background.astype(cp.float32)
# Vectorized blending operation
result_frame = cp.clip(alpha_3d * frame_float + (1.0 - alpha_3d) * green_float, 0, 255)
return cp.asnumpy(result_frame.astype(cp.uint8))
except Exception as e:
logger.error(f"GPU enhanced processing failed, falling back to CPU: {e}")
return self._apply_green_mask_cpu_enhanced(frame, alpha_mask)
def _apply_green_mask_cpu_enhanced(self, frame: np.ndarray, alpha_mask: np.ndarray) -> np.ndarray:
"""CPU-based green mask application with alpha blending (Phase 1 optimized)."""
# Resize alpha mask to match frame if needed
if alpha_mask.shape != frame.shape[:2]:
alpha_mask = cv2.resize(alpha_mask, (frame.shape[1], frame.shape[0]))
# Create green background with broadcasting (more efficient)
green_color = np.array(self.green_color, dtype=np.uint8)
green_background = np.broadcast_to(green_color, frame.shape)
        # Expand alpha to HxWx1 so it broadcasts across the color channels
        alpha_3d = np.expand_dims(alpha_mask.astype(np.float32), axis=2)
        # Blend in float32 to avoid uint8 overflow, then clip back to [0, 255]
frame_float = frame.astype(np.float32)
green_float = green_background.astype(np.float32)
result_frame = np.clip(alpha_3d * frame_float + (1.0 - alpha_3d) * green_float, 0, 255)
return result_frame.astype(np.uint8)
def apply_colored_mask(self, frame: np.ndarray, masks_a: List[np.ndarray],
masks_b: List[np.ndarray]) -> np.ndarray:
"""
Apply colored masks for visualization (green and blue).
Args:
frame: Input video frame
masks_a: Masks for object A (green)
masks_b: Masks for object B (blue)
Returns:
Frame with colored masks applied
"""
colored_mask = np.zeros_like(frame)
# Apply green color to masks_a
for mask in masks_a:
mask = mask.squeeze()
if mask.shape != frame.shape[:2]:
mask = cv2.resize(mask, (frame.shape[1], frame.shape[0]),
interpolation=cv2.INTER_NEAREST)
colored_mask[mask > 0] = self.green_color
# Apply blue color to masks_b
for mask in masks_b:
mask = mask.squeeze()
if mask.shape != frame.shape[:2]:
mask = cv2.resize(mask, (frame.shape[1], frame.shape[0]),
interpolation=cv2.INTER_NEAREST)
colored_mask[mask > 0] = self.blue_color
return colored_mask
def _precompute_upscaled_masks(self, video_segments: Dict[int, Dict[int, np.ndarray]],
target_width: int, target_height: int) -> Dict[int, Dict[int, np.ndarray]]:
"""
Pre-compute all upscaled masks to avoid per-frame upscaling.
Args:
video_segments: Dictionary of frame masks from SAM2
target_width: Target frame width
target_height: Target frame height
Returns:
Dictionary with pre-upscaled masks
"""
logger.info(f"Pre-computing upscaled masks for {len(video_segments)} frames")
upscaled_segments = {}
for frame_idx, frame_masks in video_segments.items():
upscaled_frame_masks = {}
for obj_id, mask in frame_masks.items():
mask = mask.squeeze()
if mask.shape != (target_height, target_width):
upscaled_mask = cv2.resize(mask.astype(np.uint8),
(target_width, target_height),
interpolation=cv2.INTER_NEAREST)
upscaled_frame_masks[obj_id] = upscaled_mask
else:
upscaled_frame_masks[obj_id] = mask.astype(np.uint8)
upscaled_segments[frame_idx] = upscaled_frame_masks
logger.info(f"Pre-computed upscaled masks for {len(upscaled_segments)} frames")
return upscaled_segments
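
    # Design note: precomputing trades memory for per-frame speed. Each upscaled
    # uint8 mask costs width * height bytes (roughly 2 MB per object per frame at
    # 1920x1080), so very long segments may favor the per-frame resize path instead.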
def process_and_save_output_video(self, video_path: str, output_video_path: str,
video_segments: Dict[int, Dict[int, np.ndarray]],
use_nvenc: bool = False, bitrate: str = "50M") -> bool:
"""
Process high-resolution frames, apply upscaled masks, and save the output video.
Args:
video_path: Path to input video
output_video_path: Path to save output video
video_segments: Dictionary of frame masks
use_nvenc: Whether to use NVIDIA hardware encoding
bitrate: Output video bitrate
Returns:
True if successful
"""
try:
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
logger.error(f"Could not open video: {video_path}")
return False
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
logger.info(f"Processing video: {frame_width}x{frame_height} @ {fps}fps, {total_frames} frames")
# Pre-compute all upscaled masks (Phase 1 optimization)
upscaled_segments = self._precompute_upscaled_masks(video_segments, frame_width, frame_height)
# Setup VideoWriter
if self.output_mode == "alpha_channel":
# For alpha channel, we need a codec that supports transparency
success = self._setup_alpha_encoder(output_video_path, frame_width, frame_height, fps, bitrate)
if not success:
logger.error("Failed to setup alpha channel encoder")
cap.release()
return False
use_nvenc = False # Override NVENC for alpha channel
elif use_nvenc:
success = self._setup_nvenc_encoder(output_video_path, frame_width, frame_height, fps, bitrate)
if not success:
logger.warning("NVENC setup failed, falling back to OpenCV")
use_nvenc = False
if not use_nvenc and self.output_mode != "alpha_channel":
# Use OpenCV VideoWriter
fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Use mp4v for better compatibility
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
if not out.isOpened():
logger.error("Failed to create output video writer")
cap.release()
return False
# Process frames with batch reading (Phase 1 optimization)
frame_idx = 0
processed_frames = 0
batch_size = 10 # Process frames in batches for better I/O performance
frame_buffer = []
# Pre-fill frame buffer
for _ in range(min(batch_size, len(upscaled_segments))):
ret, frame = cap.read()
if ret:
frame_buffer.append(frame)
else:
break
buffer_idx = 0
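            # Note: frame indices in upscaled_segments are assumed to be contiguous
            # and zero-based, so len() doubles as the number of frames to emit.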
while frame_idx < len(upscaled_segments) and buffer_idx < len(frame_buffer):
frame = frame_buffer[buffer_idx]
if frame_idx in upscaled_segments:
# Get pre-computed upscaled masks for this frame (Phase 1 optimization)
upscaled_masks = [upscaled_segments[frame_idx][obj_id]
for obj_id in upscaled_segments[frame_idx]]
# Apply mask based on output mode (no upscaling needed - already done)
if self.output_mode == "alpha_channel":
result_frame = self.apply_mask_with_alpha(frame, upscaled_masks)
else:
result_frame = self.apply_green_mask(frame, upscaled_masks)
else:
# No mask for this frame
if self.output_mode == "alpha_channel":
# Create fully transparent frame for alpha channel mode
bgra_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
bgra_frame[:, :, 3] = 0 # Fully transparent
result_frame = bgra_frame
logger.warning(f"No mask for frame {frame_idx}, using transparent frame")
else:
# Use original frame for green screen mode
logger.warning(f"No mask for frame {frame_idx}, using original")
result_frame = frame
# Write frame
if self.output_mode == "alpha_channel" and hasattr(self, 'alpha_process'):
self.alpha_process.stdin.write(result_frame.tobytes())
elif use_nvenc and hasattr(self, 'nvenc_process'):
self.nvenc_process.stdin.write(result_frame.tobytes())
else:
out.write(result_frame)
processed_frames += 1
frame_idx += 1
buffer_idx += 1
# Refill buffer when needed
if buffer_idx >= len(frame_buffer) and frame_idx < len(upscaled_segments):
frame_buffer.clear()
buffer_idx = 0
# Read next batch
for _ in range(min(batch_size, len(upscaled_segments) - frame_idx)):
ret, frame = cap.read()
if ret:
frame_buffer.append(frame)
else:
break
# Progress logging
if processed_frames % 100 == 0:
logger.info(f"Processed {processed_frames}/{total_frames} frames")
# Cleanup
cap.release()
if self.output_mode == "alpha_channel" and hasattr(self, 'alpha_process'):
self.alpha_process.stdin.close()
self.alpha_process.wait()
if self.alpha_process.returncode != 0:
logger.error("Alpha channel encoding failed")
return False
elif use_nvenc and hasattr(self, 'nvenc_process'):
self.nvenc_process.stdin.close()
self.nvenc_process.wait()
if self.nvenc_process.returncode != 0:
logger.error("NVENC encoding failed")
return False
else:
out.release()
logger.info(f"Successfully processed {processed_frames} frames to {output_video_path}")
return True
except Exception as e:
logger.error(f"Error processing video: {e}")
return False
def _setup_nvenc_encoder(self, output_path: str, width: int, height: int,
fps: float, bitrate: str) -> bool:
"""Setup NVENC hardware encoder using FFmpeg."""
try:
# Determine encoder based on platform
if sys.platform == 'darwin':
encoder = 'hevc_videotoolbox'
else:
encoder = 'hevc_nvenc'
command = [
'ffmpeg',
'-y', # Overwrite output file
'-f', 'rawvideo',
'-vcodec', 'rawvideo',
'-pix_fmt', 'bgr24',
'-s', f'{width}x{height}',
'-r', str(fps),
'-i', '-', # Input from stdin
'-an', # No audio (will be added later)
'-vcodec', encoder,
'-pix_fmt', 'yuv420p', # Changed from nv12 for better compatibility
'-preset', 'slow',
'-b:v', bitrate,
output_path
]
self.nvenc_process = subprocess.Popen(command, stdin=subprocess.PIPE,
stderr=subprocess.PIPE)
logger.info(f"Initialized {encoder} hardware encoder")
return True
except Exception as e:
logger.error(f"Failed to setup NVENC encoder: {e}")
return False
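
    # For reference, the pipe above is equivalent to running (values illustrative):
    #
    #   ffmpeg -y -f rawvideo -vcodec rawvideo -pix_fmt bgr24 -s 1920x1080 -r 30 \
    #          -i - -an -vcodec hevc_nvenc -pix_fmt yuv420p -preset slow \
    #          -b:v 50M output.mp4
    #
    # and feeding raw BGR frames to stdin via frame.tobytes().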
def _setup_alpha_encoder(self, output_path: str, width: int, height: int,
fps: float, bitrate: str) -> bool:
"""Setup encoder for alpha channel video using FFmpeg with H.264/H.265."""
try:
            # For VR180 SBS output we use H.265 (HEVC). Standard H.264/H.265 streams
            # do not carry alpha: the pipe accepts BGRA frames, but the yuv420p output
            # below flattens transparency during conversion. True alpha would require
            # a codec such as ProRes 4444 or VP9 with yuva420p.
# Determine encoder based on platform
if sys.platform == 'darwin':
encoder = 'hevc_videotoolbox'
else:
encoder = 'hevc_nvenc'
command = [
'ffmpeg',
'-y', # Overwrite output file
'-f', 'rawvideo',
'-vcodec', 'rawvideo',
'-pix_fmt', 'bgra', # BGRA for alpha channel
'-s', f'{width}x{height}',
'-r', str(fps),
'-i', '-', # Input from stdin
'-an', # No audio (will be added later)
'-c:v', encoder,
                '-pix_fmt', 'yuv420p',  # yuv420p has no alpha plane; transparency is flattened here
'-preset', 'slow',
'-b:v', bitrate,
'-tag:v', 'hvc1', # Required for some players
output_path
]
self.alpha_process = subprocess.Popen(command, stdin=subprocess.PIPE,
stderr=subprocess.PIPE)
self.alpha_output_path = output_path
logger.info(f"Initialized {encoder} for alpha channel output (will be encoded as transparency in RGB)")
return True
except Exception as e:
logger.error(f"Failed to setup alpha encoder: {e}")
return False
def process_segment(self, segment_info: dict, video_segments: Dict[int, Dict[int, np.ndarray]],
use_nvenc: bool = False, bitrate: str = "50M") -> bool:
"""
Process a single segment and save the output video.
Args:
segment_info: Segment information dictionary
video_segments: Dictionary of frame masks from SAM2
use_nvenc: Whether to use hardware encoding
bitrate: Output video bitrate
Returns:
True if successful
"""
input_video = segment_info['video_file']
if self.output_mode == "alpha_channel":
output_video = os.path.join(segment_info['directory'], f"output_{segment_info['index']}.mov")
else:
output_video = os.path.join(segment_info['directory'], f"output_{segment_info['index']}.mp4")
logger.info(f"Processing segment {segment_info['index']} with {self.output_mode}")
success = self.process_and_save_output_video(
input_video,
output_video,
video_segments,
use_nvenc,
bitrate
)
if success:
logger.info(f"Successfully created {self.output_mode} video: {output_video}")
else:
logger.error(f"Failed to process segment {segment_info['index']}")
return success
def create_full_greenscreen_frame(self, frame_shape: Tuple[int, int, int],
green_color: Optional[List[int]] = None) -> np.ndarray:
"""
Create a full greenscreen frame for fallback when no humans are detected.
Args:
frame_shape: Shape of the frame (height, width, channels)
            green_color: BGR values for green screen color (uses default if None)
Returns:
Full greenscreen frame
"""
if green_color is None:
green_color = self.green_color
greenscreen_frame = np.full(frame_shape, green_color, dtype=np.uint8)
logger.debug(f"Created full greenscreen frame with shape {frame_shape}")
return greenscreen_frame
def process_greenscreen_only_segment(self, segment_info: dict,
green_color: Optional[List[int]] = None,
use_nvenc: bool = False, bitrate: str = "50M") -> bool:
"""
Create a full greenscreen segment when no humans are detected.
Used as fallback in separate eye processing mode.
Args:
segment_info: Segment information dictionary
            green_color: BGR values for green screen color (uses default if None)
use_nvenc: Whether to use hardware encoding
bitrate: Output video bitrate
Returns:
True if greenscreen segment was created successfully
"""
segment_dir = segment_info['directory']
video_path = segment_info['video_file']
segment_idx = segment_info['index']
logger.info(f"Creating full greenscreen segment {segment_idx} (no humans detected)")
try:
# Get video properties
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
logger.error(f"Could not open video: {video_path}")
return False
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.release()
# Create output video path
if self.output_mode == "alpha_channel":
output_video_path = os.path.join(segment_dir, f"output_{segment_idx}.mov")
else:
output_video_path = os.path.join(segment_dir, f"output_{segment_idx}.mp4")
# Create greenscreen frame
if green_color is None:
green_color = self.green_color
greenscreen_frame = self.create_full_greenscreen_frame(
(height, width, 3), green_color
)
# Setup video writer based on mode and hardware encoding preference
if use_nvenc:
success = self._write_greenscreen_with_nvenc(
output_video_path, greenscreen_frame, frame_count, fps, bitrate
)
else:
success = self._write_greenscreen_with_opencv(
output_video_path, greenscreen_frame, frame_count, fps
)
if not success:
logger.error(f"Failed to write greenscreen video for segment {segment_idx}")
return False
# Create empty mask file (black mask since no humans detected)
mask_output_path = os.path.join(segment_dir, "mask.png")
black_mask = np.zeros((height, width, 3), dtype=np.uint8)
cv2.imwrite(mask_output_path, black_mask)
# Mark segment as completed
output_done_file = os.path.join(segment_dir, "output_frames_done")
with open(output_done_file, 'w') as f:
f.write(f"Greenscreen segment {segment_idx} completed successfully\n")
logger.info(f"Successfully created greenscreen segment {segment_idx}")
return True
except Exception as e:
logger.error(f"Error creating greenscreen segment {segment_idx}: {e}")
return False
def _write_greenscreen_with_opencv(self, output_path: str, greenscreen_frame: np.ndarray,
frame_count: int, fps: float) -> bool:
"""Write greenscreen video using OpenCV VideoWriter."""
try:
if self.output_mode == "alpha_channel":
# For alpha channel mode, create fully transparent frames
bgra_frame = cv2.cvtColor(greenscreen_frame, cv2.COLOR_BGR2BGRA)
bgra_frame[:, :, 3] = 0 # Fully transparent
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps,
(greenscreen_frame.shape[1], greenscreen_frame.shape[0]), True)
frame_to_write = bgra_frame[:, :, :3] # OpenCV expects BGR for mp4v
else:
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps,
(greenscreen_frame.shape[1], greenscreen_frame.shape[0]))
frame_to_write = greenscreen_frame
if not out.isOpened():
logger.error(f"Failed to open video writer for {output_path}")
return False
# Write identical greenscreen frames
for _ in range(frame_count):
out.write(frame_to_write)
out.release()
logger.debug(f"Wrote {frame_count} greenscreen frames using OpenCV")
return True
except Exception as e:
logger.error(f"Error writing greenscreen with OpenCV: {e}")
return False
def _write_greenscreen_with_nvenc(self, output_path: str, greenscreen_frame: np.ndarray,
frame_count: int, fps: float, bitrate: str) -> bool:
"""Write greenscreen video using NVENC hardware encoding."""
try:
# Setup NVENC encoder
if not self._setup_nvenc_encoder(output_path,
greenscreen_frame.shape[1],
greenscreen_frame.shape[0],
fps, bitrate):
logger.warning("NVENC setup failed for greenscreen, falling back to OpenCV")
return self._write_greenscreen_with_opencv(output_path, greenscreen_frame, frame_count, fps)
# Write identical greenscreen frames
for _ in range(frame_count):
self.nvenc_process.stdin.write(greenscreen_frame.tobytes())
# Finalize encoding
self.nvenc_process.stdin.close()
self.nvenc_process.wait()
if self.nvenc_process.returncode != 0:
logger.error("NVENC encoding failed for greenscreen")
return False
logger.debug(f"Wrote {frame_count} greenscreen frames using NVENC")
return True
except Exception as e:
logger.error(f"Error writing greenscreen with NVENC: {e}")
return False
def has_valid_masks(self, video_segments: Optional[Dict[int, Dict[int, np.ndarray]]]) -> bool:
"""
Check if video segments contain valid masks.
Args:
video_segments: Video segments dictionary from SAM2
Returns:
True if valid masks are found
"""
if not video_segments:
return False
# Check if any frame has non-empty masks
        for frame_masks in video_segments.values():
            for mask in frame_masks.values():
if mask is not None and np.any(mask):
return True
return False
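

if __name__ == "__main__":
    # Minimal smoke test (illustrative only, not part of the segment pipeline):
    # build a synthetic gray frame and a filled-circle mask, then run the green
    # screen path with edge blur enabled and report the output shape.
    logging.basicConfig(level=logging.INFO)
    frame = np.full((360, 640, 3), 128, dtype=np.uint8)
    mask = np.zeros((360, 640), dtype=np.float32)
    cv2.circle(mask, (320, 180), 100, 1.0, -1)  # filled circle as the "object"
    processor = MaskProcessor(mask_quality_config={'enable_edge_blur': True})
    result = processor.apply_green_mask(frame, [mask])
    print(f"result: shape={result.shape}, dtype={result.dtype}")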