---
# YOLO + SAM2 Video Processing Configuration
# This file serves as a complete reference for all available settings.

input:
  # Full path to the input video file.
  video_path: "/path/to/input/video.mp4"

output:
  # Directory where all output files and segments will be stored.
  directory: "/path/to/output/"
  # Filename for the final assembled video.
  filename: "processed_video.mp4"

processing:
  # Duration of each video segment in seconds. Shorter segments use less memory.
  segment_duration: 5
  # Scale factor for SAM2 inference (e.g., 0.5 = half resolution).
  # Lower values are faster but may reduce mask quality.
  inference_scale: 0.5
  # YOLO detection confidence threshold (0.0 to 1.0).
  yolo_confidence: 0.6
  # Which segments to run YOLO detection on.
  # Options: "all", a list of specific segment indices (e.g., [0, 10, 20]),
  # or [] for default ("all").
  detect_segments: "all"

  # --- VR180 Stereo Processing ---
  # Enables special logic for VR180 SBS video. When false, video is treated
  # as a single view.
  separate_eye_processing: false
  # Threshold for stereo mask agreement (Intersection over Union).
  # A value of 0.5 means masks must overlap by 50% to be considered a pair.
  stereo_iou_threshold: 0.5
  # Factor to reduce YOLO confidence by if no stereo pairs are found on the
  # first try (e.g., 0.8 = 20% reduction).
  confidence_reduction_factor: 0.8
  # If no humans are detected in a segment, create a full green screen video.
  # Only used when separate_eye_processing is true.
  enable_greenscreen_fallback: true
  # Pixel overlap between left/right eyes for smoother blending at the
  # center seam.
  eye_overlap_pixels: 0

models:
  # YOLO mode: "detection" (for bounding boxes) or "segmentation" (for
  # direct masks). "segmentation" is generally recommended as it provides
  # initial masks to SAM2.
  yolo_mode: "segmentation"
  # Path to the YOLO model for "detection" mode.
  yolo_detection_model: "models/yolo/yolo11l.pt"
  # Path to the YOLO model for "segmentation" mode.
  yolo_segmentation_model: "models/yolo/yolo11x-seg.pt"

  # --- SAM2 Model Configuration ---
  sam2_checkpoint: "models/sam2/checkpoints/sam2.1_hiera_small.pt"
  sam2_config: "models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml"
  # (Experimental) Use optimized VOS predictor for a significant speedup.
  # Requires PyTorch 2.5.1+.
  sam2_vos_optimized: false

video:
  # Use NVIDIA's NVENC for hardware-accelerated video encoding.
  use_nvenc: true
  # Bitrate for the output video (e.g., "25M", "50M").
  output_bitrate: "50M"
  # If true, the audio track from the input video will be copied to the
  # final output.
  preserve_audio: true
  # Force keyframes at the start of each segment for clean cuts.
  # Recommended to keep true.
  force_keyframes: true

advanced:
  # RGB color for the green screen background.
  green_color: [0, 255, 0]
  # RGB color for the second object's mask (typically the right eye in VR180).
  # NOTE(review): [255, 0, 0] is red in RGB but blue in BGR (OpenCV channel
  # order), which contradicts the "RGB" wording while matching the key name
  # "blue_color" — confirm the consumer's channel order before relying on it.
  blue_color: [255, 0, 0]
  # The class ID for humans in the YOLO model (COCO default is 0 for "person").
  human_class_id: 0
  # If true, deletes intermediate files like segment videos after processing.
  cleanup_intermediate_files: true
  # Logging level: DEBUG, INFO, WARNING, ERROR.
  log_level: "INFO"
  # If true, saves debug images for YOLO detections.
  save_yolo_debug_frames: true

  # --- Mid-Segment Re-detection ---
  # Re-run YOLO at intervals within a segment to correct tracking drift.
  enable_mid_segment_detection: false
  redetection_interval: 30  # Frames between re-detections.
  max_redetections_per_segment: 10

  # --- Parallel Processing Optimizations ---
  # (Experimental) Generate low-res videos for upcoming segments in the
  # background.
  enable_background_lowres_generation: false
  max_concurrent_lowres: 2  # Max parallel FFmpeg processes.
  lowres_segments_ahead: 2  # How many segments to prepare in advance.
  use_ffmpeg_lowres: true  # Use FFmpeg (faster) instead of OpenCV for low-res creation.

# --- Mask Quality Enhancement Settings ---
# These settings allow fine-tuning of the final mask appearance.
# Enabling these may increase processing time.
mask_processing:
  # Edge feathering and blurring for smoother transitions.
  enable_edge_blur: true
  edge_blur_radius: 3
  edge_blur_sigma: 0.5
  # Temporal smoothing to reduce mask flickering between frames.
  enable_temporal_smoothing: false
  temporal_blend_weight: 0.2
  temporal_history_frames: 2
  # Clean up small noise and holes in the mask.
  # Generally not needed when using SAM2, as its masks are high quality.
  enable_morphological_cleaning: false
  morphology_kernel_size: 5
  min_component_size: 500
  # Method for blending the mask edge with the background.
  # Options: "linear" (fastest), "gaussian", "sigmoid".
  alpha_blending_mode: "linear"
  alpha_transition_width: 1
  # Advanced edge-preserving smoothing filter. Slower but can produce
  # higher quality edges.
  enable_bilateral_filter: false
  bilateral_d: 9
  bilateral_sigma_color: 75
  bilateral_sigma_space: 75