#!/usr/bin/env python3 """ Main entry point for YOLO + SAM2 video processing pipeline. Processes long videos by splitting into segments, detecting humans with YOLO, and creating green screen masks with SAM2. """ import os import sys import argparse import cv2 import numpy as np from typing import List # Add project root to path sys.path.append(os.path.dirname(__file__)) from core.config_loader import ConfigLoader from core.video_splitter import VideoSplitter from core.yolo_detector import YOLODetector from core.sam2_processor import SAM2Processor from core.mask_processor import MaskProcessor from core.video_assembler import VideoAssembler from utils.logging_utils import setup_logging, get_logger from utils.file_utils import ensure_directory from utils.status_utils import print_processing_status, cleanup_incomplete_segment logger = get_logger(__name__) def parse_arguments(): """Parse command line arguments.""" parser = argparse.ArgumentParser( description="YOLO + SAM2 Video Processing Pipeline" ) parser.add_argument( "--config", type=str, required=True, help="Path to YAML configuration file" ) parser.add_argument( "--log-file", type=str, help="Optional log file path" ) parser.add_argument( "--status", action="store_true", help="Show processing status and exit" ) parser.add_argument( "--cleanup-segment", type=int, help="Clean up a specific segment for restart (segment index)" ) return parser.parse_args() def validate_dependencies(): """Validate that required dependencies are available.""" try: import torch import cv2 import numpy as np import cupy as cp from ultralytics import YOLO from sam2.build_sam import build_sam2_video_predictor logger.info("All dependencies validated successfully") return True except ImportError as e: logger.error(f"Missing dependency: {e}") logger.error("Please install requirements: pip install -r requirements.txt") return False def create_yolo_mask_debug_frame(detections: List[dict], video_path: str, output_path: str, scale: float = 1.0) -> bool: """ Create debug visualization for YOLO direct masks. Args: detections: List of YOLO detections with masks video_path: Path to video file output_path: Path to save debug image scale: Scale factor for frame processing Returns: True if debug frame was created successfully """ try: # Load first frame cap = cv2.VideoCapture(video_path) ret, original_frame = cap.read() cap.release() if not ret: logger.error("Could not read first frame for YOLO mask debug") return False # Scale frame if needed if scale != 1.0: original_frame = cv2.resize(original_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) debug_frame = original_frame.copy() # Define colors for each object colors = { 1: (0, 255, 0), # Green for Object 1 (Left eye) 2: (255, 0, 0), # Blue for Object 2 (Right eye) } # Get detections with masks detections_with_masks = [d for d in detections if d.get('has_mask', False)] # Overlay masks with transparency obj_id = 1 for detection in detections_with_masks[:2]: # Up to 2 objects mask = detection['mask'] # Resize mask to match frame if needed if mask.shape != original_frame.shape[:2]: mask = cv2.resize(mask.astype(np.float32), (original_frame.shape[1], original_frame.shape[0]), interpolation=cv2.INTER_NEAREST) mask = mask > 0.5 mask = mask.astype(bool) # Apply colored overlay color = colors.get(obj_id, (128, 128, 128)) overlay = debug_frame.copy() overlay[mask] = color # Blend with original (30% overlay, 70% original) cv2.addWeighted(overlay, 0.3, debug_frame, 0.7, 0, debug_frame) # Draw outline contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) cv2.drawContours(debug_frame, contours, -1, color, 2) logger.info(f"YOLO Mask Debug: Object {obj_id} mask - shape: {mask.shape}, pixels: {np.sum(mask)}") obj_id += 1 # Add title and source info title = f"YOLO Direct Masks: {len(detections_with_masks)} objects detected" cv2.putText(debug_frame, title, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2) source_info = "Mask Source: YOLO Segmentation (DIRECT - No SAM2)" cv2.putText(debug_frame, source_info, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) # Green for YOLO # Add object legend y_offset = 90 for i, detection in enumerate(detections_with_masks[:2]): obj_id = i + 1 color = colors.get(obj_id, (128, 128, 128)) text = f"Object {obj_id}: {'Left Eye' if obj_id == 1 else 'Right Eye'} (YOLO Mask)" cv2.putText(debug_frame, text, (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2) y_offset += 30 # Save debug image success = cv2.imwrite(output_path, debug_frame) if success: logger.info(f"YOLO Mask Debug: Saved debug frame to {output_path}") else: logger.error(f"Failed to save YOLO mask debug frame to {output_path}") return success except Exception as e: logger.error(f"Error creating YOLO mask debug frame: {e}") return False def resolve_detect_segments(detect_segments, total_segments: int) -> List[int]: """ Resolve detect_segments configuration to list of segment indices. Args: detect_segments: Configuration value ("all", list, or None) total_segments: Total number of segments Returns: List of segment indices to process """ if detect_segments == "all" or detect_segments is None: return list(range(total_segments)) elif isinstance(detect_segments, list): # Filter out invalid segment indices valid_segments = [s for s in detect_segments if 0 <= s < total_segments] if len(valid_segments) != len(detect_segments): logger.warning(f"Some segment indices are invalid. Using: {valid_segments}") return valid_segments else: logger.warning(f"Invalid detect_segments format: {detect_segments}. Using all segments.") return list(range(total_segments)) def process_segment_with_separate_eyes(segment_info, detector, sam2_processor, mask_processor, config, previous_left_masks=None, previous_right_masks=None): """ Process a single segment using separate eye processing mode. Split video first, then run YOLO independently on each eye. Args: segment_info: Segment information dictionary detector: YOLO detector instance sam2_processor: SAM2 processor with eye processing enabled mask_processor: Mask processor instance config: Configuration loader instance previous_left_masks: Previous masks for left eye previous_right_masks: Previous masks for right eye Returns: Tuple of (success, left_masks, right_masks) """ segment_idx = segment_info['index'] logger.info(f"VR180 Separate Eyes: Processing segment {segment_idx} (video-split approach)") # Get video properties cap = cv2.VideoCapture(segment_info['video_file']) frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) cap.release() full_frame_shape = (frame_height, frame_width) # Step 1: Split the segment video into left and right eye videos left_eye_video = os.path.join(segment_info['directory'], "left_eye.mp4") right_eye_video = os.path.join(segment_info['directory'], "right_eye.mp4") logger.info(f"VR180 Separate Eyes: Splitting segment video into eye videos") success = sam2_processor.eye_processor.split_video_into_eyes( segment_info['video_file'], left_eye_video, right_eye_video, scale=config.get_inference_scale() ) if not success: logger.error(f"VR180 Separate Eyes: Failed to split video for segment {segment_idx}") return False, None, None # Check if both eye videos were created if not os.path.exists(left_eye_video) or not os.path.exists(right_eye_video): logger.error(f"VR180 Separate Eyes: Eye video files not created for segment {segment_idx}") return False, None, None logger.info(f"VR180 Separate Eyes: Created eye videos - left: {left_eye_video}, right: {right_eye_video}") # Step 2: Run YOLO independently on each eye video left_detections = detector.detect_humans_in_video_first_frame( left_eye_video, scale=1.0 # Already scaled during video splitting ) right_detections = detector.detect_humans_in_video_first_frame( right_eye_video, scale=1.0 # Already scaled during video splitting ) logger.info(f"VR180 Separate Eyes: YOLO detections - left: {len(left_detections)}, right: {len(right_detections)}") # Check if we have YOLO segmentation masks has_yolo_masks = False if detector.supports_segmentation: has_yolo_masks = any(d.get('has_mask', False) for d in (left_detections + right_detections)) if has_yolo_masks: logger.info(f"VR180 Separate Eyes: YOLO segmentation mode - using direct masks instead of bounding boxes") # Save eye-specific debug frames if enabled if config.get('advanced.save_yolo_debug_frames', False) and (left_detections or right_detections): try: # Load first frames from each eye video left_cap = cv2.VideoCapture(left_eye_video) ret_left, left_frame = left_cap.read() left_cap.release() right_cap = cv2.VideoCapture(right_eye_video) ret_right, right_frame = right_cap.read() right_cap.release() if ret_left and ret_right: # Save eye-specific debug frames left_debug_path = os.path.join(segment_info['directory'], "left_eye_debug.jpg") right_debug_path = os.path.join(segment_info['directory'], "right_eye_debug.jpg") detector.save_eye_debug_frames( left_frame, right_frame, left_detections, right_detections, left_debug_path, right_debug_path ) logger.info(f"VR180 Separate Eyes: Saved eye-specific debug frames for segment {segment_idx}") else: logger.warning(f"VR180 Separate Eyes: Could not load eye frames for debug visualization") except Exception as e: logger.warning(f"VR180 Separate Eyes: Failed to create eye debug frames: {e}") # Step 3: Process left eye if detections exist or we have previous masks left_masks = None if left_detections or previous_left_masks: try: left_prompts = None left_initial_masks = None if left_detections: if has_yolo_masks: # YOLO segmentation mode: convert masks to initial masks for SAM2 left_initial_masks = {} for i, detection in enumerate(left_detections): if detection.get('has_mask', False): mask = detection['mask'] left_initial_masks[1] = mask.astype(bool) # Always use obj_id=1 for single eye logger.info(f"VR180 Separate Eyes: Left eye YOLO mask - shape: {mask.shape}, pixels: {np.sum(mask)}") break # Only take the first/best mask for single eye processing if left_initial_masks: logger.info(f"VR180 Separate Eyes: Left eye - using YOLO segmentation masks as initial masks") else: # YOLO detection mode: convert bounding boxes to prompts left_prompts = detector.convert_detections_to_sam2_prompts(left_detections, frame_width // 2) logger.info(f"VR180 Separate Eyes: Left eye - {len(left_prompts)} SAM2 prompts") # Create temporary segment info for left eye processing left_segment_info = segment_info.copy() left_segment_info['video_file'] = left_eye_video left_masks = sam2_processor.process_single_eye_segment( left_segment_info, 'left', left_prompts, left_initial_masks or previous_left_masks, 1.0 # Scale already applied during video splitting ) if left_masks: logger.info(f"VR180 Separate Eyes: Left eye processed - {len(left_masks)} frame masks") else: logger.warning(f"VR180 Separate Eyes: Left eye processing failed") except Exception as e: logger.error(f"VR180 Separate Eyes: Error processing left eye for segment {segment_idx}: {e}") left_masks = None # Step 4: Process right eye if detections exist or we have previous masks right_masks = None if right_detections or previous_right_masks: try: right_prompts = None right_initial_masks = None if right_detections: if has_yolo_masks: # YOLO segmentation mode: convert masks to initial masks for SAM2 right_initial_masks = {} for i, detection in enumerate(right_detections): if detection.get('has_mask', False): mask = detection['mask'] right_initial_masks[1] = mask.astype(bool) # Always use obj_id=1 for single eye logger.info(f"VR180 Separate Eyes: Right eye YOLO mask - shape: {mask.shape}, pixels: {np.sum(mask)}") break # Only take the first/best mask for single eye processing if right_initial_masks: logger.info(f"VR180 Separate Eyes: Right eye - using YOLO segmentation masks as initial masks") else: # YOLO detection mode: convert bounding boxes to prompts right_prompts = detector.convert_detections_to_sam2_prompts(right_detections, frame_width // 2) logger.info(f"VR180 Separate Eyes: Right eye - {len(right_prompts)} SAM2 prompts") # Create temporary segment info for right eye processing right_segment_info = segment_info.copy() right_segment_info['video_file'] = right_eye_video right_masks = sam2_processor.process_single_eye_segment( right_segment_info, 'right', right_prompts, right_initial_masks or previous_right_masks, 1.0 # Scale already applied during video splitting ) if right_masks: logger.info(f"VR180 Separate Eyes: Right eye processed - {len(right_masks)} frame masks") else: logger.warning(f"VR180 Separate Eyes: Right eye processing failed") except Exception as e: logger.error(f"VR180 Separate Eyes: Error processing right eye for segment {segment_idx}: {e}") right_masks = None # Step 5: Check if we got any valid masks if not left_masks and not right_masks: logger.warning(f"VR180 Separate Eyes: Neither eye produced valid masks for segment {segment_idx}") if config.get('processing.enable_greenscreen_fallback', True): logger.info(f"VR180 Separate Eyes: Using greenscreen fallback for segment {segment_idx}") success = mask_processor.process_greenscreen_only_segment( segment_info, green_color=config.get_green_color(), use_nvenc=config.get_use_nvenc(), bitrate=config.get_output_bitrate() ) return success, None, None else: logger.error(f"VR180 Separate Eyes: No masks generated and greenscreen fallback disabled") return False, None, None # Step 6: Combine masks back to full frame format try: logger.info(f"VR180 Separate Eyes: Combining eye masks for segment {segment_idx}") combined_masks = sam2_processor.eye_processor.combine_eye_masks( left_masks, right_masks, full_frame_shape ) if not combined_masks: logger.error(f"VR180 Separate Eyes: Failed to combine eye masks for segment {segment_idx}") return False, left_masks, right_masks # Validate combined masks have reasonable content total_mask_pixels = 0 for frame_idx, frame_masks in combined_masks.items(): for obj_id, mask in frame_masks.items(): if mask is not None: total_mask_pixels += np.sum(mask) if total_mask_pixels == 0: logger.warning(f"VR180 Separate Eyes: Combined masks are empty for segment {segment_idx}") if config.get('processing.enable_greenscreen_fallback', True): logger.info(f"VR180 Separate Eyes: Using greenscreen fallback due to empty masks") success = mask_processor.process_greenscreen_only_segment( segment_info, green_color=config.get_green_color(), use_nvenc=config.get_use_nvenc(), bitrate=config.get_output_bitrate() ) return success, left_masks, right_masks logger.info(f"VR180 Separate Eyes: Combined masks contain {total_mask_pixels} total pixels") except Exception as e: logger.error(f"VR180 Separate Eyes: Error combining eye masks for segment {segment_idx}: {e}") # Try greenscreen fallback if mask combination fails if config.get('processing.enable_greenscreen_fallback', True): logger.info(f"VR180 Separate Eyes: Using greenscreen fallback due to mask combination error") success = mask_processor.process_greenscreen_only_segment( segment_info, green_color=config.get_green_color(), use_nvenc=config.get_use_nvenc(), bitrate=config.get_output_bitrate() ) return success, left_masks, right_masks else: return False, left_masks, right_masks # Step 7: Save combined masks mask_path = os.path.join(segment_info['directory'], "mask.png") sam2_processor.save_final_masks( combined_masks, mask_path, green_color=config.get_green_color(), blue_color=config.get_blue_color() ) # Step 8: Apply green screen and save output video success = mask_processor.process_segment( segment_info, combined_masks, use_nvenc=config.get_use_nvenc(), bitrate=config.get_output_bitrate() ) if success: logger.info(f"VR180 Separate Eyes: Successfully processed segment {segment_idx}") else: logger.error(f"VR180 Separate Eyes: Failed to create output video for segment {segment_idx}") # Clean up temporary eye video files try: if os.path.exists(left_eye_video): os.remove(left_eye_video) if os.path.exists(right_eye_video): os.remove(right_eye_video) logger.debug(f"VR180 Separate Eyes: Cleaned up temporary eye videos for segment {segment_idx}") except Exception as e: logger.warning(f"VR180 Separate Eyes: Failed to clean up temporary eye videos: {e}") return success, left_masks, right_masks async def main_async(): """Main processing pipeline with async optimizations.""" args = parse_arguments() try: # Load configuration config = ConfigLoader(args.config) # Setup logging setup_logging(config.get_log_level(), args.log_file) # Handle status check if args.status: output_dir = config.get_output_directory() input_video = config.get_input_video_path() video_name = os.path.splitext(os.path.basename(input_video))[0] segments_dir = os.path.join(output_dir, f"{video_name}_segments") print_processing_status(segments_dir) return 0 # Handle segment cleanup if args.cleanup_segment is not None: output_dir = config.get_output_directory() input_video = config.get_input_video_path() video_name = os.path.splitext(os.path.basename(input_video))[0] segments_dir = os.path.join(output_dir, f"{video_name}_segments") segment_dir = os.path.join(segments_dir, f"segment_{args.cleanup_segment}") if cleanup_incomplete_segment(segment_dir): logger.info(f"Successfully cleaned up segment {args.cleanup_segment}") return 0 else: logger.error(f"Failed to clean up segment {args.cleanup_segment}") return 1 logger.info("Starting YOLO + SAM2 video processing pipeline") # Validate dependencies if not validate_dependencies(): return 1 # Validate input video exists input_video = config.get_input_video_path() if not os.path.exists(input_video): logger.error(f"Input video not found: {input_video}") return 1 # Setup output directory output_dir = config.get_output_directory() ensure_directory(output_dir) # Step 1: Split video into segments logger.info("Step 1: Splitting video into segments") splitter = VideoSplitter( segment_duration=config.get_segment_duration(), force_keyframes=config.get('video.force_keyframes', True) ) segments_dir, segment_dirs = splitter.split_video(input_video, output_dir) logger.info(f"Created {len(segment_dirs)} segments in {segments_dir}") # Get detailed segment information segments_info = splitter.get_segment_info(segments_dir) # Resolve which segments to process with YOLO detect_segments_config = config.get_detect_segments() detect_segments = resolve_detect_segments(detect_segments_config, len(segments_info)) # Initialize processors once logger.info("Step 2: Initializing YOLO detector") # Get YOLO mode and model paths yolo_mode = config.get('models.yolo_mode', 'detection') detection_model = config.get('models.yolo_detection_model', config.get_yolo_model_path()) segmentation_model = config.get('models.yolo_segmentation_model', None) logger.info(f"YOLO Mode: {yolo_mode}") detector = YOLODetector( detection_model_path=detection_model, segmentation_model_path=segmentation_model, mode=yolo_mode, confidence_threshold=config.get_yolo_confidence(), human_class_id=config.get_human_class_id() ) logger.info("Step 3: Initializing SAM2 processor") # Check if separate eye processing is enabled separate_eye_processing = config.get('processing.separate_eye_processing', False) eye_overlap_pixels = config.get('processing.eye_overlap_pixels', 0) enable_greenscreen_fallback = config.get('processing.enable_greenscreen_fallback', True) # Initialize async preprocessor if enabled async_preprocessor = None if config.get('advanced.enable_background_lowres_generation', False): from core.async_lowres_preprocessor import AsyncLowResPreprocessor max_concurrent = config.get('advanced.max_concurrent_lowres', 3) segments_ahead = config.get('advanced.lowres_segments_ahead', 3) use_ffmpeg = config.get('advanced.use_ffmpeg_lowres', True) async_preprocessor = AsyncLowResPreprocessor( max_concurrent=max_concurrent, segments_ahead=segments_ahead, use_ffmpeg=use_ffmpeg ) logger.info(f"Async low-res preprocessing: ENABLED (max_concurrent={max_concurrent}, segments_ahead={segments_ahead})") else: logger.info("Async low-res preprocessing: DISABLED") if separate_eye_processing: logger.info("VR180 Separate Eye Processing: ENABLED") logger.info(f"Eye overlap pixels: {eye_overlap_pixels}") logger.info(f"Greenscreen fallback: {enable_greenscreen_fallback}") sam2_processor = SAM2Processor( checkpoint_path=config.get_sam2_checkpoint(), config_path=config.get_sam2_config(), vos_optimized=config.get('models.sam2_vos_optimized', False), separate_eye_processing=separate_eye_processing, eye_overlap_pixels=eye_overlap_pixels, async_preprocessor=async_preprocessor ) # Initialize mask processor with quality enhancements mask_quality_config = config.get('mask_processing', {}) mask_processor = MaskProcessor( green_color=config.get_green_color(), blue_color=config.get_blue_color(), mask_quality_config=mask_quality_config ) # Process each segment sequentially (YOLO -> SAM2 -> Render) logger.info("Step 4: Processing segments sequentially") total_humans_detected = 0 # Start background low-res video preprocessing if enabled if async_preprocessor: logger.info("Starting background low-res video preprocessing") async_preprocessor.start_background_preparation( segments_info, config.get_inference_scale(), separate_eye_processing, current_segment=0 ) # Initialize previous masks for separate eye processing previous_left_masks = None previous_right_masks = None for i, segment_info in enumerate(segments_info): segment_idx = segment_info['index'] logger.info(f"Processing segment {segment_idx}/{len(segments_info)-1}") # Start background preparation for upcoming segments if async_preprocessor and i < len(segments_info) - 1: async_preprocessor.start_background_preparation( segments_info, config.get_inference_scale(), separate_eye_processing, current_segment=i ) # Reset temporal history for new segment mask_processor.reset_temporal_history() # Skip if segment output already exists output_video = os.path.join(segment_info['directory'], f"output_{segment_idx}.mp4") if os.path.exists(output_video): logger.info(f"Segment {segment_idx} already processed, skipping") continue # Branch based on processing mode if separate_eye_processing: # Use separate eye processing mode success, left_masks, right_masks = process_segment_with_separate_eyes( segment_info, detector, sam2_processor, mask_processor, config, previous_left_masks, previous_right_masks ) # Update previous masks for next segment previous_left_masks = left_masks previous_right_masks = right_masks if success: logger.info(f"Successfully processed segment {segment_idx} with separate eye processing") else: logger.error(f"Failed to process segment {segment_idx} with separate eye processing") continue # Skip the original processing logic # Determine if we should use YOLO detections or previous masks use_detections = segment_idx in detect_segments # First segment must use detections if segment_idx == 0 and not use_detections: logger.warning(f"First segment must use YOLO detection") use_detections = True # Get YOLO prompts or previous masks yolo_prompts = None previous_masks = None if use_detections: # Run YOLO detection on current segment logger.info(f"Running YOLO detection on segment {segment_idx}") detection_file = os.path.join(segment_info['directory'], "yolo_detections") # Check if detection already exists if os.path.exists(detection_file): logger.info(f"Loading existing YOLO detections for segment {segment_idx}") detections = detector.load_detections_from_file(detection_file) else: # Run YOLO detection on first frame detections = detector.detect_humans_in_video_first_frame( segment_info['video_file'], scale=config.get_inference_scale() ) # Save detections for future runs detector.save_detections_to_file(detections, detection_file) if detections: total_humans_detected += len(detections) logger.info(f"Found {len(detections)} humans in segment {segment_idx}") # Get frame width from video cap = cv2.VideoCapture(segment_info['video_file']) frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) cap.release() yolo_prompts = detector.convert_detections_to_sam2_prompts( detections, frame_width ) # If no right eye detections found, run debug analysis with lower confidence half_frame_width = frame_width // 2 right_eye_detections = [d for d in detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width] if len(right_eye_detections) == 0 and config.get('advanced.save_yolo_debug_frames', False): logger.info(f"VR180 Debug: No right eye detections found, running lower confidence analysis...") # Load first frame for debug analysis cap = cv2.VideoCapture(segment_info['video_file']) ret, debug_frame = cap.read() cap.release() if ret: # Scale frame to match detection scale if config.get_inference_scale() != 1.0: scale = config.get_inference_scale() debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) # Run debug detection with lower confidence debug_detections = detector.debug_detect_with_lower_confidence(debug_frame, debug_confidence=0.3) # Analyze where these lower confidence detections are debug_right_eye = [d for d in debug_detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width] if len(debug_right_eye) > 0: logger.warning(f"VR180 Debug: Found {len(debug_right_eye)} right eye detections with lower confidence!") for i, det in enumerate(debug_right_eye): logger.warning(f"VR180 Debug: Right eye detection {i+1}: conf={det['confidence']:.3f}, bbox={det['bbox']}") logger.warning(f"VR180 Debug: Consider lowering yolo_confidence from {config.get_yolo_confidence()} to 0.3-0.4") else: logger.info(f"VR180 Debug: No right eye detections found even with confidence 0.3") logger.info(f"VR180 Debug: This confirms person is not visible in right eye view") logger.info(f"Pipeline Debug: Segment {segment_idx} - Generated {len(yolo_prompts)} SAM2 prompts from {len(detections)} YOLO detections") # Save debug frame with detections visualized (if enabled) if config.get('advanced.save_yolo_debug_frames', False): debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug.jpg") # Load first frame for debug visualization cap = cv2.VideoCapture(segment_info['video_file']) ret, debug_frame = cap.read() cap.release() if ret: # Scale frame to match detection scale if config.get_inference_scale() != 1.0: scale = config.get_inference_scale() debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) detector.save_debug_frame_with_detections(debug_frame, detections, debug_frame_path, yolo_prompts) else: logger.warning(f"Could not load frame for debug visualization in segment {segment_idx}") # Check if we have YOLO masks for debug visualization has_yolo_masks = False if detections and detector.supports_segmentation: has_yolo_masks = any(d.get('has_mask', False) for d in detections) # Generate first frame masks debug (SAM2 or YOLO) first_frame_debug_path = os.path.join(segment_info['directory'], "first_frame_detection.jpg") if has_yolo_masks: logger.info(f"Pipeline Debug: Generating YOLO first frame masks for segment {segment_idx}") # Create YOLO mask debug visualization create_yolo_mask_debug_frame(detections, segment_info['video_file'], first_frame_debug_path, config.get_inference_scale()) else: logger.info(f"Pipeline Debug: Generating SAM2 first frame masks for segment {segment_idx}") sam2_processor.generate_first_frame_debug_masks( segment_info['video_file'], yolo_prompts, first_frame_debug_path, config.get_inference_scale() ) else: logger.warning(f"No humans detected in segment {segment_idx}") # Save debug frame even when no detections (if enabled) if config.get('advanced.save_yolo_debug_frames', False): debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug_no_detections.jpg") # Load first frame for debug visualization cap = cv2.VideoCapture(segment_info['video_file']) ret, debug_frame = cap.read() cap.release() if ret: # Scale frame to match detection scale if config.get_inference_scale() != 1.0: scale = config.get_inference_scale() debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) # Add "No detections" text overlay cv2.putText(debug_frame, "YOLO: No humans detected", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2) # Red text cv2.imwrite(debug_frame_path, debug_frame) logger.info(f"Saved no-detection debug frame to {debug_frame_path}") else: logger.warning(f"Could not load frame for no-detection debug visualization in segment {segment_idx}") elif segment_idx > 0: # Try to load previous segment mask for j in range(segment_idx - 1, -1, -1): prev_segment_dir = segments_info[j]['directory'] previous_masks = sam2_processor.load_previous_segment_mask(prev_segment_dir) if previous_masks: logger.info(f"Using masks from segment {j} for segment {segment_idx}") break if not yolo_prompts and not previous_masks: logger.error(f"No prompts or previous masks available for segment {segment_idx}") continue # Check if we have YOLO masks and can skip SAM2 (recheck in case detections were loaded from file) if not 'has_yolo_masks' in locals(): has_yolo_masks = False if detections and detector.supports_segmentation: has_yolo_masks = any(d.get('has_mask', False) for d in detections) if has_yolo_masks: logger.info(f"Pipeline Debug: YOLO segmentation provided masks - using as SAM2 initial masks for segment {segment_idx}") # Convert YOLO masks to initial masks for SAM2 cap = cv2.VideoCapture(segment_info['video_file']) frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) cap.release() # Convert YOLO masks to the format expected by SAM2 add_previous_masks_to_predictor yolo_masks_dict = {} for i, detection in enumerate(detections[:2]): # Up to 2 objects if detection.get('has_mask', False): mask = detection['mask'] # Resize mask to match inference scale if config.get_inference_scale() != 1.0: scale = config.get_inference_scale() scaled_height = int(frame_height * scale) scaled_width = int(frame_width * scale) mask = cv2.resize(mask.astype(np.float32), (scaled_width, scaled_height), interpolation=cv2.INTER_NEAREST) mask = mask > 0.5 obj_id = i + 1 # Sequential object IDs yolo_masks_dict[obj_id] = mask.astype(bool) logger.info(f"Pipeline Debug: YOLO mask for Object {obj_id} - shape: {mask.shape}, pixels: {np.sum(mask)}") logger.info(f"Pipeline Debug: Using YOLO masks as SAM2 initial masks - {len(yolo_masks_dict)} objects") # Use traditional SAM2 pipeline with YOLO masks as initial masks previous_masks = yolo_masks_dict yolo_prompts = None # Don't use bounding box prompts when we have masks # Debug what we're passing to SAM2 if yolo_prompts: logger.info(f"Pipeline Debug: Passing {len(yolo_prompts)} YOLO prompts to SAM2 for segment {segment_idx}") for i, prompt in enumerate(yolo_prompts): logger.info(f"Pipeline Debug: Prompt {i+1}: Object {prompt['obj_id']}, bbox={prompt['bbox']}") if previous_masks: logger.info(f"Pipeline Debug: Using {len(previous_masks)} previous masks for segment {segment_idx}") logger.info(f"Pipeline Debug: Previous mask object IDs: {list(previous_masks.keys())}") # Handle mid-segment detection if enabled (works for both detection and segmentation modes) multi_frame_prompts = None if config.get('advanced.enable_mid_segment_detection', False) and (yolo_prompts or has_yolo_masks): logger.info(f"Mid-segment Detection: Enabled for segment {segment_idx}") # Calculate frame indices for re-detection cap = cv2.VideoCapture(segment_info['video_file']) total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) fps = cap.get(cv2.CAP_PROP_FPS) or 30.0 cap.release() redetection_interval = config.get('advanced.redetection_interval', 30) max_redetections = config.get('advanced.max_redetections_per_segment', 10) # Generate frame indices: [30, 60, 90, ...] (skip frame 0 since we already have first frame prompts) frame_indices = [] frame_idx = redetection_interval while frame_idx < total_frames and len(frame_indices) < max_redetections: frame_indices.append(frame_idx) frame_idx += redetection_interval if frame_indices: logger.info(f"Mid-segment Detection: Running YOLO on frames {frame_indices} (interval={redetection_interval})") # Run multi-frame detection multi_frame_detections = detector.detect_humans_multi_frame( segment_info['video_file'], frame_indices, scale=config.get_inference_scale() ) # Convert detections to SAM2 prompts (different handling for segmentation vs detection mode) multi_frame_prompts = {} cap = cv2.VideoCapture(segment_info['video_file']) frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) cap.release() for frame_idx, detections in multi_frame_detections.items(): if detections: if has_yolo_masks: # Segmentation mode: convert YOLO masks to SAM2 mask prompts frame_masks = {} for i, detection in enumerate(detections[:2]): # Up to 2 objects if detection.get('has_mask', False): mask = detection['mask'] # Resize mask to match inference scale if config.get_inference_scale() != 1.0: scale = config.get_inference_scale() scaled_height = int(frame_height * scale) scaled_width = int(frame_width * scale) mask = cv2.resize(mask.astype(np.float32), (scaled_width, scaled_height), interpolation=cv2.INTER_NEAREST) mask = mask > 0.5 obj_id = i + 1 # Sequential object IDs frame_masks[obj_id] = mask.astype(bool) logger.debug(f"Mid-segment Detection: Frame {frame_idx}, Object {obj_id} mask - shape: {mask.shape}, pixels: {np.sum(mask)}") if frame_masks: # Store as mask prompts (different format than bbox prompts) multi_frame_prompts[frame_idx] = {'masks': frame_masks} logger.info(f"Mid-segment Detection: Frame {frame_idx} -> {len(frame_masks)} YOLO masks") else: # Detection mode: convert to bounding box prompts (existing logic) prompts = detector.convert_detections_to_sam2_prompts(detections, frame_width) multi_frame_prompts[frame_idx] = prompts logger.info(f"Mid-segment Detection: Frame {frame_idx} -> {len(prompts)} SAM2 prompts") logger.info(f"Mid-segment Detection: Generated prompts for {len(multi_frame_prompts)} frames") else: logger.info(f"Mid-segment Detection: No additional frames to process (segment has {total_frames} frames)") elif config.get('advanced.enable_mid_segment_detection', False): logger.info(f"Mid-segment Detection: Skipped for segment {segment_idx} (no initial YOLO data)") # Process segment with SAM2 logger.info(f"Pipeline Debug: Starting SAM2 processing for segment {segment_idx}") video_segments = sam2_processor.process_single_segment( segment_info, yolo_prompts=yolo_prompts, previous_masks=previous_masks, inference_scale=config.get_inference_scale(), multi_frame_prompts=multi_frame_prompts ) if video_segments is None: logger.error(f"SAM2 processing failed for segment {segment_idx}") continue # Check if SAM2 produced adequate results if len(video_segments) == 0: logger.error(f"SAM2 produced no frames for segment {segment_idx}") continue elif len(video_segments) < 10: # Expected many frames for a 5-second segment logger.warning(f"SAM2 produced very few frames ({len(video_segments)}) for segment {segment_idx} - this may indicate propagation failure") # Debug what SAM2 produced logger.info(f"Pipeline Debug: SAM2 completed for segment {segment_idx}") logger.info(f"Pipeline Debug: Generated masks for {len(video_segments)} frames") if video_segments: # Check first frame to see what objects were tracked first_frame_idx = min(video_segments.keys()) first_frame_objects = video_segments[first_frame_idx] logger.info(f"Pipeline Debug: First frame contains {len(first_frame_objects)} tracked objects") logger.info(f"Pipeline Debug: Tracked object IDs: {list(first_frame_objects.keys())}") for obj_id, mask in first_frame_objects.items(): mask_pixels = np.sum(mask) logger.info(f"Pipeline Debug: Object {obj_id} mask has {mask_pixels} pixels") # Check last frame as well last_frame_idx = max(video_segments.keys()) last_frame_objects = video_segments[last_frame_idx] logger.info(f"Pipeline Debug: Last frame contains {len(last_frame_objects)} tracked objects") logger.info(f"Pipeline Debug: Final object IDs: {list(last_frame_objects.keys())}") # Save final masks for next segment mask_path = os.path.join(segment_info['directory'], "mask.png") sam2_processor.save_final_masks( video_segments, mask_path, green_color=config.get_green_color(), blue_color=config.get_blue_color() ) # Apply green screen and save output video success = mask_processor.process_segment( segment_info, video_segments, use_nvenc=config.get_use_nvenc(), bitrate=config.get_output_bitrate() ) if success: logger.info(f"Successfully processed segment {segment_idx}") else: logger.error(f"Failed to create green screen video for segment {segment_idx}") # Log processing summary logger.info(f"Sequential processing complete. Total humans detected: {total_humans_detected}") # Step 3: Assemble final video logger.info("Step 3: Assembling final video with audio") # Initialize video assembler assembler = VideoAssembler( preserve_audio=config.get_preserve_audio(), use_nvenc=config.get_use_nvenc() ) # Verify all segments are complete all_complete, missing = assembler.verify_segment_completeness(segments_dir) if not all_complete: logger.error(f"Cannot assemble video - missing segments: {missing}") return 1 # Assemble final video final_output = os.path.join(output_dir, config.get_output_filename()) success = assembler.assemble_final_video( segments_dir, input_video, final_output, bitrate=config.get_output_bitrate() ) if success: logger.info(f"Final video saved to: {final_output}") logger.info("Pipeline completed successfully") return 0 except Exception as e: logger.error(f"Pipeline failed: {e}", exc_info=True) return 1 finally: # Cleanup async preprocessor if it was used if async_preprocessor: async_preprocessor.cleanup() logger.debug("Async preprocessor cleanup completed") def main(): """Main entry point - wrapper for async main.""" import asyncio return asyncio.run(main_async()) if __name__ == "__main__": exit_code = main() sys.exit(exit_code)