stereo mask working

2025-07-31 11:13:31 -07:00
parent 0057017ac4
commit b97a3752a7
8 changed files with 1247 additions and 206 deletions
--- a/config.yaml
+++ b/config.yaml
@@ -1,75 +1,137 @@
 # YOLO + SAM2 Video Processing Configuration
+# This file serves as a complete reference for all available settings.

 input:
+  # Full path to the input video file.
  video_path: "/path/to/input/video.mp4"
  
 output:
+  # Directory where all output files and segments will be stored.
  directory: "/path/to/output/"
+  # Filename for the final assembled video.
  filename: "processed_video.mp4"
  
 processing:
-  # Duration of each video segment in seconds
+  # Duration of each video segment in seconds. Shorter segments use less memory.
  segment_duration: 5
  
-  # Scale factor for SAM2 inference (0.5 = half resolution)
+  # Scale factor for SAM2 inference (e.g., 0.5 = half resolution).
+  # Lower values are faster but may reduce mask quality.
  inference_scale: 0.5
  
-  # YOLO detection confidence threshold
+  # YOLO detection confidence threshold (0.0 to 1.0).
  yolo_confidence: 0.6
  
-  # Which segments to run YOLO detection on
-  # Options: "all", [0, 5, 10], or [] for default (all)
+  # Which segments to run YOLO detection on.
+  # Options: "all", a list of specific segment indices (e.g., [0, 10, 20]), or [] for default ("all").
  detect_segments: "all"
  
-  # VR180 separate eye processing mode (default: false for backward compatibility)
+  # --- VR180 Stereo Processing ---
+  # Enables special logic for VR180 side-by-side (SBS) video. When false, video is treated as a single view.
  separate_eye_processing: false
  
-  # Enable full greenscreen fallback when no humans detected (only used with separate_eye_processing)
+  # Threshold for stereo mask agreement (Intersection over Union).
+  # A value of 0.5 means the masks' intersection must cover 50% of their union to be considered a pair.
+  stereo_iou_threshold: 0.5
+  
+  # Multiplier applied to the YOLO confidence threshold when no stereo pairs are found on the first try (e.g., 0.8 = 20% reduction).
+  confidence_reduction_factor: 0.8
+  
+  # If no humans are detected in a segment, create a full green screen video.
+  # Only used when separate_eye_processing is true.
  enable_greenscreen_fallback: true
  
-  # Pixel overlap between left/right eyes for blending (optional, default: 0)
+  # Pixel overlap between left/right eyes for smoother blending at the center seam.
  eye_overlap_pixels: 0
  
 models:
-  # YOLO detection mode: "detection" (bounding boxes) or "segmentation" (direct masks)
-  yolo_mode: "segmentation"  # Default: existing behavior, Options: "detection", "segmentation"
+  # YOLO mode: "detection" (for bounding boxes) or "segmentation" (for direct masks).
+  # "segmentation" is generally recommended as it provides initial masks to SAM2.
+  yolo_mode: "segmentation"
  
-  # YOLO model paths for different modes
-  yolo_detection_model: "models/yolo/yolo11l.pt"      # Regular YOLO for detection mode
-  yolo_segmentation_model: "models/yolo/yolo11x-seg.pt"  # Segmentation YOLO for segmentation mode
+  # Path to the YOLO model for "detection" mode.
+  yolo_detection_model: "models/yolo/yolo11l.pt"
+  # Path to the YOLO model for "segmentation" mode.
+  yolo_segmentation_model: "models/yolo/yolo11x-seg.pt"
  
-  # SAM2 model configuration
+  # --- SAM2 Model Configuration ---
  sam2_checkpoint: "models/sam2/checkpoints/sam2.1_hiera_small.pt"
  sam2_config: "models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml"
-  
+  # (Experimental) Use optimized VOS predictor for a significant speedup. Requires PyTorch 2.5.1+.
+  sam2_vos_optimized: false
+
 video:
-  # Use NVIDIA hardware encoding (requires NVENC-capable GPU)
+  # Use NVIDIA's NVENC for hardware-accelerated video encoding.
  use_nvenc: true
  
-  # Output video bitrate
+  # Bitrate for the output video (e.g., "25M", "50M").
  output_bitrate: "50M"
  
-  # Preserve original audio track
+  # If true, the audio track from the input video will be copied to the final output.
  preserve_audio: true
  
-  # Force keyframes for better segment boundaries
+  # Force keyframes at the start of each segment for clean cuts. Recommended to keep true.
  force_keyframes: true
  
 advanced:
-  # Green screen color (RGB values)
+  # RGB color for the green screen background.
  green_color: [0, 255, 0]
  
-  # Blue screen color for second object (RGB values)  
+  # Color for the second object's mask (typically the right eye in VR180). NOTE(review): [255, 0, 0] is blue only in BGR order (red in RGB) - confirm channel order.
  blue_color: [255, 0, 0]
  
-  # YOLO human class ID (0 for COCO person class)
+  # The class ID for humans in the YOLO model (COCO default is 0 for "person").
  human_class_id: 0
  
-  # GPU memory management
+  # If true, deletes intermediate files like segment videos after processing.
  cleanup_intermediate_files: true
  
-  # Logging level (DEBUG, INFO, WARNING, ERROR)
+  # Logging level: DEBUG, INFO, WARNING, ERROR.
  log_level: "INFO"

-  # Save debug frames with YOLO detections visualized
+  # If true, saves debug images for YOLO detections.
  save_yolo_debug_frames: true
+  
+  # --- Mid-Segment Re-detection ---
+  # Re-run YOLO at intervals within a segment to correct tracking drift.
+  enable_mid_segment_detection: false
+  redetection_interval: 30 # Frames between re-detections.
+  max_redetections_per_segment: 10
+
+  # --- Parallel Processing Optimizations ---
+  # (Experimental) Generate low-res videos for upcoming segments in the background.
+  enable_background_lowres_generation: false
+  max_concurrent_lowres: 2 # Max parallel FFmpeg processes.
+  lowres_segments_ahead: 2 # How many segments to prepare in advance.
+  use_ffmpeg_lowres: true # Use FFmpeg (faster) instead of OpenCV for low-res creation.
+
+# --- Mask Quality Enhancement Settings ---
+# These settings allow fine-tuning of the final mask appearance.
+# Enabling these may increase processing time.
+mask_processing:
+  # Edge feathering and blurring for smoother transitions.
+  enable_edge_blur: true
+  edge_blur_radius: 3
+  edge_blur_sigma: 0.5
+  
+  # Temporal smoothing to reduce mask flickering between frames.
+  enable_temporal_smoothing: false
+  temporal_blend_weight: 0.2
+  temporal_history_frames: 2
+  
+  # Clean up small noise and holes in the mask.
+  # Generally not needed when using SAM2, as its masks are high quality.
+  enable_morphological_cleaning: false
+  morphology_kernel_size: 5
+  min_component_size: 500
+  
+  # Method for blending the mask edge with the background.
+  # Options: "linear" (fastest), "gaussian", "sigmoid".
+  alpha_blending_mode: "linear"
+  alpha_transition_width: 1
+  
+  # Advanced edge-preserving smoothing filter. Slower but can produce higher quality edges.
+  enable_bilateral_filter: false
+  bilateral_d: 9
+  bilateral_sigma_color: 75
+  bilateral_sigma_space: 75