working with segmentation

2025-07-27 13:55:52 -07:00
parent 46363a8a11
commit cd7bc54efe
7 changed files with 1302 additions and 105 deletions
--- a/main.py
+++ b/main.py
@@ -8,6 +8,8 @@ and creating green screen masks with SAM2.
 import os
 import sys
 import argparse
+import cv2
+import numpy as np
 from typing import List

 # Add project root to path
@@ -16,6 +18,9 @@ sys.path.append(os.path.dirname(__file__))
 from core.config_loader import ConfigLoader
 from core.video_splitter import VideoSplitter
 from core.yolo_detector import YOLODetector
+from core.sam2_processor import SAM2Processor
+from core.mask_processor import MaskProcessor
+from core.video_assembler import VideoAssembler
 from utils.logging_utils import setup_logging, get_logger
 from utils.file_utils import ensure_directory
 from utils.status_utils import print_processing_status, cleanup_incomplete_segment
@@ -66,6 +71,100 @@ def validate_dependencies():
        logger.error("Please install requirements: pip install -r requirements.txt")
        return False

+def create_yolo_mask_debug_frame(detections: List[dict], video_path: str, output_path: str, scale: float = 1.0) -> bool:
+    """
+    Render the first video frame with YOLO segmentation masks overlaid and save it.
+
+    At most the first two detections flagged with ``has_mask`` are drawn: each mask
+    is tinted at 30% opacity, outlined, and listed in a legend. The title reports
+    how many detections carry a mask in total, which may exceed the two drawn.
+
+    Args:
+        detections: YOLO detection dicts; entries with a truthy ``has_mask``
+            are expected to provide an array under ``mask``.
+        video_path: Path of the video whose first frame is the background.
+        output_path: Destination path for the debug image.
+        scale: Resize factor applied to the frame before drawing.
+
+    Returns:
+        True if the image was written; False if the frame could not be read,
+        the write failed, or any exception occurred (it is logged, not raised).
+    """
+    # BGR colours per object id; ids outside the table fall back to grey.
+    colors = {
+        1: (0, 255, 0),    # Green for Object 1 (Left eye)
+        2: (255, 0, 0),    # Blue for Object 2 (Right eye)
+    }
+    fallback_color = (128, 128, 128)
+    font = cv2.FONT_HERSHEY_SIMPLEX
+
+    try:
+        # Release the capture even if reading raises, so the handle never leaks.
+        cap = cv2.VideoCapture(video_path)
+        try:
+            ret, original_frame = cap.read()
+        finally:
+            cap.release()
+
+        if not ret:
+            logger.error("Could not read first frame for YOLO mask debug")
+            return False
+
+        # Scale frame if needed
+        if scale != 1.0:
+            original_frame = cv2.resize(original_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
+
+        debug_frame = original_frame.copy()
+        frame_height, frame_width = original_frame.shape[:2]
+
+        detections_with_masks = [d for d in detections if d.get('has_mask', False)]
+        drawn = detections_with_masks[:2]  # Up to 2 objects
+
+        for obj_id, detection in enumerate(drawn, start=1):
+            mask = detection['mask']
+
+            # Nearest-neighbour resize avoids introducing in-between mask values.
+            if mask.shape != (frame_height, frame_width):
+                mask = cv2.resize(mask.astype(np.float32), (frame_width, frame_height), interpolation=cv2.INTER_NEAREST)
+                mask = mask > 0.5
+
+            mask = mask.astype(bool)
+
+            # Tint the masked pixels, then blend 30% overlay with 70% of the frame.
+            color = colors.get(obj_id, fallback_color)
+            overlay = debug_frame.copy()
+            overlay[mask] = color
+            cv2.addWeighted(overlay, 0.3, debug_frame, 0.7, 0, debug_frame)
+
+            # Draw outline
+            contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            cv2.drawContours(debug_frame, contours, -1, color, 2)
+
+            logger.info(f"YOLO Mask Debug: Object {obj_id} mask - shape: {mask.shape}, pixels: {np.sum(mask)}")
+
+        # Add title and source info
+        title = f"YOLO Direct Masks: {len(detections_with_masks)} objects detected"
+        cv2.putText(debug_frame, title, (10, 30), font, 1.0, (255, 255, 255), 2)
+
+        source_info = "Mask Source: YOLO Segmentation (DIRECT - No SAM2)"
+        cv2.putText(debug_frame, source_info, (10, 60), font, 0.7, (0, 255, 0), 2)  # Green for YOLO
+
+        # Legend: one line per drawn object, stacked 30 px apart below the source line.
+        for obj_id in range(1, len(drawn) + 1):
+            text = f"Object {obj_id}: {'Left Eye' if obj_id == 1 else 'Right Eye'} (YOLO Mask)"
+            cv2.putText(debug_frame, text, (10, 60 + 30 * obj_id), font, 0.7, colors.get(obj_id, fallback_color), 2)
+
+        # Save debug image
+        success = cv2.imwrite(output_path, debug_frame)
+        if success:
+            logger.info(f"YOLO Mask Debug: Saved debug frame to {output_path}")
+        else:
+            logger.error(f"Failed to save YOLO mask debug frame to {output_path}")
+        return success
+    except Exception as e:
+        logger.error(f"Error creating YOLO mask debug frame: {e}")
+        return False
+
 def resolve_detect_segments(detect_segments, total_segments: int) -> List[int]:
    """
    Resolve detect_segments configuration to list of segment indices.
@@ -157,31 +256,394 @@ def main():
        detect_segments_config = config.get_detect_segments()
        detect_segments = resolve_detect_segments(detect_segments_config, len(segments_info))
        
-        # Step 2: Run YOLO detection on specified segments
-        logger.info("Step 2: Running YOLO human detection")
+        # Initialize processors once
+        logger.info("Step 2: Initializing YOLO detector")
+        
+        # Get YOLO mode and model paths
+        yolo_mode = config.get('models.yolo_mode', 'detection')
+        detection_model = config.get('models.yolo_detection_model', config.get_yolo_model_path())
+        segmentation_model = config.get('models.yolo_segmentation_model', None)
+        
+        logger.info(f"YOLO Mode: {yolo_mode}")
+        
        detector = YOLODetector(
-            model_path=config.get_yolo_model_path(),
+            detection_model_path=detection_model,
+            segmentation_model_path=segmentation_model,
+            mode=yolo_mode,
            confidence_threshold=config.get_yolo_confidence(),
            human_class_id=config.get_human_class_id()
        )
        
-        detection_results = detector.process_segments_batch(
-            segments_info, 
-            detect_segments, 
-            scale=config.get_inference_scale()
+        logger.info("Step 3: Initializing SAM2 processor")
+        sam2_processor = SAM2Processor(
+            checkpoint_path=config.get_sam2_checkpoint(),
+            config_path=config.get_sam2_config()
        )
        
-        # Log detection summary
-        total_humans = sum(len(detections) for detections in detection_results.values())
-        logger.info(f"Detected {total_humans} humans across {len(detection_results)} segments")
+        # Initialize mask processor
+        mask_processor = MaskProcessor(
+            green_color=config.get_green_color(),
+            blue_color=config.get_blue_color()
+        )
        
-        # Step 3: Process segments with SAM2 (placeholder for now)
-        logger.info("Step 3: SAM2 processing and green screen generation")
-        logger.info("SAM2 processing module not yet implemented - this is where segment processing would occur")
+        # Process each segment sequentially (YOLO -> SAM2 -> Render)
+        logger.info("Step 4: Processing segments sequentially")
+        total_humans_detected = 0
        
-        # Step 4: Assemble final video (placeholder for now)  
-        logger.info("Step 4: Assembling final video with audio")
-        logger.info("Video assembly module not yet implemented - this is where concatenation and audio copying would occur")
+        for i, segment_info in enumerate(segments_info):
+            segment_idx = segment_info['index']
+            
+            logger.info(f"Processing segment {segment_idx}/{len(segments_info)-1}")
+            
+            # Skip if segment output already exists
+            output_video = os.path.join(segment_info['directory'], f"output_{segment_idx}.mp4")
+            if os.path.exists(output_video):
+                logger.info(f"Segment {segment_idx} already processed, skipping")
+                continue
+            
+            # Determine if we should use YOLO detections or previous masks
+            use_detections = segment_idx in detect_segments
+            
+            # First segment must use detections
+            if segment_idx == 0 and not use_detections:
+                logger.warning(f"First segment must use YOLO detection")
+                use_detections = True
+            
+            # Get YOLO prompts or previous masks
+            yolo_prompts = None
+            previous_masks = None
+            
+            if use_detections:
+                # Run YOLO detection on current segment
+                logger.info(f"Running YOLO detection on segment {segment_idx}")
+                detection_file = os.path.join(segment_info['directory'], "yolo_detections")
+                
+                # Check if detection already exists
+                if os.path.exists(detection_file):
+                    logger.info(f"Loading existing YOLO detections for segment {segment_idx}")
+                    detections = detector.load_detections_from_file(detection_file)
+                else:
+                    # Run YOLO detection on first frame
+                    detections = detector.detect_humans_in_video_first_frame(
+                        segment_info['video_file'], 
+                        scale=config.get_inference_scale()
+                    )
+                    # Save detections for future runs
+                    detector.save_detections_to_file(detections, detection_file)
+                
+                if detections:
+                    total_humans_detected += len(detections)
+                    logger.info(f"Found {len(detections)} humans in segment {segment_idx}")
+                    
+                    # Get frame width from video
+                    cap = cv2.VideoCapture(segment_info['video_file'])
+                    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+                    cap.release()
+                    
+                    yolo_prompts = detector.convert_detections_to_sam2_prompts(
+                        detections, frame_width
+                    )
+                    
+                    # If no right eye detections found, run debug analysis with lower confidence
+                    half_frame_width = frame_width // 2
+                    right_eye_detections = [d for d in detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width]
+                    
+                    if len(right_eye_detections) == 0 and config.get('advanced.save_yolo_debug_frames', False):
+                        logger.info(f"VR180 Debug: No right eye detections found, running lower confidence analysis...")
+                        
+                        # Load first frame for debug analysis
+                        cap = cv2.VideoCapture(segment_info['video_file'])
+                        ret, debug_frame = cap.read()
+                        cap.release()
+                        
+                        if ret:
+                            # Scale frame to match detection scale
+                            if config.get_inference_scale() != 1.0:
+                                scale = config.get_inference_scale()
+                                debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
+                            
+                            # Run debug detection with lower confidence
+                            debug_detections = detector.debug_detect_with_lower_confidence(debug_frame, debug_confidence=0.3)
+                            
+                            # Analyze where these lower confidence detections are
+                            debug_right_eye = [d for d in debug_detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width]
+                            
+                            if len(debug_right_eye) > 0:
+                                logger.warning(f"VR180 Debug: Found {len(debug_right_eye)} right eye detections with lower confidence!")
+                                for i, det in enumerate(debug_right_eye):
+                                    logger.warning(f"VR180 Debug: Right eye detection {i+1}: conf={det['confidence']:.3f}, bbox={det['bbox']}")
+                                logger.warning(f"VR180 Debug: Consider lowering yolo_confidence from {config.get_yolo_confidence()} to 0.3-0.4")
+                            else:
+                                logger.info(f"VR180 Debug: No right eye detections found even with confidence 0.3")
+                                logger.info(f"VR180 Debug: This confirms person is not visible in right eye view")
+                    
+                    logger.info(f"Pipeline Debug: Segment {segment_idx} - Generated {len(yolo_prompts)} SAM2 prompts from {len(detections)} YOLO detections")
+                    
+                    # Save debug frame with detections visualized (if enabled)
+                    if config.get('advanced.save_yolo_debug_frames', False):
+                        debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug.jpg")
+                        
+                        # Load first frame for debug visualization
+                        cap = cv2.VideoCapture(segment_info['video_file'])
+                        ret, debug_frame = cap.read()
+                        cap.release()
+                        
+                        if ret:
+                            # Scale frame to match detection scale
+                            if config.get_inference_scale() != 1.0:
+                                scale = config.get_inference_scale()
+                                debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
+                            
+                            detector.save_debug_frame_with_detections(debug_frame, detections, debug_frame_path, yolo_prompts)
+                        else:
+                            logger.warning(f"Could not load frame for debug visualization in segment {segment_idx}")
+                        
+                        # Check if we have YOLO masks for debug visualization
+                        has_yolo_masks = False
+                        if detections and detector.supports_segmentation:
+                            has_yolo_masks = any(d.get('has_mask', False) for d in detections)
+                        
+                        # Generate first frame masks debug (SAM2 or YOLO)
+                        first_frame_debug_path = os.path.join(segment_info['directory'], "first_frame_detection.jpg")
+                        
+                        if has_yolo_masks:
+                            logger.info(f"Pipeline Debug: Generating YOLO first frame masks for segment {segment_idx}")
+                            # Create YOLO mask debug visualization
+                            create_yolo_mask_debug_frame(detections, segment_info['video_file'], first_frame_debug_path, config.get_inference_scale())
+                        else:
+                            logger.info(f"Pipeline Debug: Generating SAM2 first frame masks for segment {segment_idx}")
+                            sam2_processor.generate_first_frame_debug_masks(
+                                segment_info['video_file'], 
+                                yolo_prompts, 
+                                first_frame_debug_path,
+                                config.get_inference_scale()
+                            )
+                else:
+                    logger.warning(f"No humans detected in segment {segment_idx}")
+                    
+                    # Save debug frame even when no detections (if enabled)
+                    if config.get('advanced.save_yolo_debug_frames', False):
+                        debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug_no_detections.jpg")
+                        
+                        # Load first frame for debug visualization
+                        cap = cv2.VideoCapture(segment_info['video_file'])
+                        ret, debug_frame = cap.read()
+                        cap.release()
+                        
+                        if ret:
+                            # Scale frame to match detection scale
+                            if config.get_inference_scale() != 1.0:
+                                scale = config.get_inference_scale()
+                                debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
+                            
+                            # Add "No detections" text overlay
+                            cv2.putText(debug_frame, "YOLO: No humans detected", 
+                                      (10, 30), 
+                                      cv2.FONT_HERSHEY_SIMPLEX, 1.0, 
+                                      (0, 0, 255), 2)  # Red text
+                            
+                            cv2.imwrite(debug_frame_path, debug_frame)
+                            logger.info(f"Saved no-detection debug frame to {debug_frame_path}")
+                        else:
+                            logger.warning(f"Could not load frame for no-detection debug visualization in segment {segment_idx}")
+            elif segment_idx > 0:
+                # Try to load previous segment mask
+                for j in range(segment_idx - 1, -1, -1):
+                    prev_segment_dir = segments_info[j]['directory']
+                    previous_masks = sam2_processor.load_previous_segment_mask(prev_segment_dir)
+                    if previous_masks:
+                        logger.info(f"Using masks from segment {j} for segment {segment_idx}")
+                        break
+            
+            if not yolo_prompts and not previous_masks:
+                logger.error(f"No prompts or previous masks available for segment {segment_idx}")
+                continue
+            
+            # Check if we have YOLO masks and can skip SAM2 (recheck in case detections were loaded from file)
+            if not 'has_yolo_masks' in locals():
+                has_yolo_masks = False
+                if detections and detector.supports_segmentation:
+                    has_yolo_masks = any(d.get('has_mask', False) for d in detections)
+                
+            if has_yolo_masks:
+                logger.info(f"Pipeline Debug: YOLO segmentation provided masks - using as SAM2 initial masks for segment {segment_idx}")
+                
+                # Convert YOLO masks to initial masks for SAM2
+                cap = cv2.VideoCapture(segment_info['video_file'])
+                frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+                frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+                cap.release()
+                
+                # Convert YOLO masks to the format expected by SAM2 add_previous_masks_to_predictor
+                yolo_masks_dict = {}
+                for i, detection in enumerate(detections[:2]):  # Up to 2 objects
+                    if detection.get('has_mask', False):
+                        mask = detection['mask']
+                        # Resize mask to match inference scale
+                        if config.get_inference_scale() != 1.0:
+                            scale = config.get_inference_scale()
+                            scaled_height = int(frame_height * scale)
+                            scaled_width = int(frame_width * scale)
+                            mask = cv2.resize(mask.astype(np.float32), (scaled_width, scaled_height), interpolation=cv2.INTER_NEAREST)
+                            mask = mask > 0.5
+                        
+                        obj_id = i + 1  # Sequential object IDs
+                        yolo_masks_dict[obj_id] = mask.astype(bool)
+                        logger.info(f"Pipeline Debug: YOLO mask for Object {obj_id} - shape: {mask.shape}, pixels: {np.sum(mask)}")
+                
+                logger.info(f"Pipeline Debug: Using YOLO masks as SAM2 initial masks - {len(yolo_masks_dict)} objects")
+                
+                # Use traditional SAM2 pipeline with YOLO masks as initial masks
+                previous_masks = yolo_masks_dict
+                yolo_prompts = None  # Don't use bounding box prompts when we have masks
+            
+            # Debug what we're passing to SAM2
+            if yolo_prompts:
+                logger.info(f"Pipeline Debug: Passing {len(yolo_prompts)} YOLO prompts to SAM2 for segment {segment_idx}")
+                for i, prompt in enumerate(yolo_prompts):
+                    logger.info(f"Pipeline Debug: Prompt {i+1}: Object {prompt['obj_id']}, bbox={prompt['bbox']}")
+            
+            if previous_masks:
+                logger.info(f"Pipeline Debug: Using {len(previous_masks)} previous masks for segment {segment_idx}")
+                logger.info(f"Pipeline Debug: Previous mask object IDs: {list(previous_masks.keys())}")
+        
+            # Handle mid-segment detection if enabled (only when using YOLO prompts, not masks)
+            multi_frame_prompts = None
+            if config.get('advanced.enable_mid_segment_detection', False) and yolo_prompts:
+                logger.info(f"Mid-segment Detection: Enabled for segment {segment_idx}")
+                
+                # Calculate frame indices for re-detection
+                cap = cv2.VideoCapture(segment_info['video_file'])
+                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+                fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
+                cap.release()
+                
+                redetection_interval = config.get('advanced.redetection_interval', 30)
+                max_redetections = config.get('advanced.max_redetections_per_segment', 10)
+                
+                # Generate frame indices: [30, 60, 90, ...] (skip frame 0 since we already have first frame prompts)
+                frame_indices = []
+                frame_idx = redetection_interval
+                while frame_idx < total_frames and len(frame_indices) < max_redetections:
+                    frame_indices.append(frame_idx)
+                    frame_idx += redetection_interval
+                
+                if frame_indices:
+                    logger.info(f"Mid-segment Detection: Running YOLO on frames {frame_indices} (interval={redetection_interval})")
+                    
+                    # Run multi-frame detection
+                    multi_frame_detections = detector.detect_humans_multi_frame(
+                        segment_info['video_file'],
+                        frame_indices,
+                        scale=config.get_inference_scale()
+                    )
+                    
+                    # Convert detections to SAM2 prompts
+                    multi_frame_prompts = {}
+                    cap = cv2.VideoCapture(segment_info['video_file'])
+                    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+                    cap.release()
+                    
+                    for frame_idx, detections in multi_frame_detections.items():
+                        if detections:
+                            prompts = detector.convert_detections_to_sam2_prompts(detections, frame_width)
+                            multi_frame_prompts[frame_idx] = prompts
+                            logger.info(f"Mid-segment Detection: Frame {frame_idx} -> {len(prompts)} SAM2 prompts")
+                    
+                    logger.info(f"Mid-segment Detection: Generated prompts for {len(multi_frame_prompts)} frames")
+                else:
+                    logger.info(f"Mid-segment Detection: No additional frames to process (segment has {total_frames} frames)")
+            elif config.get('advanced.enable_mid_segment_detection', False):
+                logger.info(f"Mid-segment Detection: Skipped for segment {segment_idx} (no initial YOLO prompts)")
+            
+            # Process segment with SAM2
+            logger.info(f"Pipeline Debug: Starting SAM2 processing for segment {segment_idx}")
+            video_segments = sam2_processor.process_single_segment(
+                segment_info,
+                yolo_prompts=yolo_prompts,
+                previous_masks=previous_masks,
+                inference_scale=config.get_inference_scale(),
+                multi_frame_prompts=multi_frame_prompts
+            )
+            
+            if video_segments is None:
+                logger.error(f"SAM2 processing failed for segment {segment_idx}")
+                continue
+            
+            # Debug what SAM2 produced
+            logger.info(f"Pipeline Debug: SAM2 completed for segment {segment_idx}")
+            logger.info(f"Pipeline Debug: Generated masks for {len(video_segments)} frames")
+            
+            if video_segments:
+                # Check first frame to see what objects were tracked
+                first_frame_idx = min(video_segments.keys())
+                first_frame_objects = video_segments[first_frame_idx]
+                logger.info(f"Pipeline Debug: First frame contains {len(first_frame_objects)} tracked objects")
+                logger.info(f"Pipeline Debug: Tracked object IDs: {list(first_frame_objects.keys())}")
+                
+                for obj_id, mask in first_frame_objects.items():
+                    mask_pixels = np.sum(mask)
+                    logger.info(f"Pipeline Debug: Object {obj_id} mask has {mask_pixels} pixels")
+                
+                # Check last frame as well
+                last_frame_idx = max(video_segments.keys())
+                last_frame_objects = video_segments[last_frame_idx]
+                logger.info(f"Pipeline Debug: Last frame contains {len(last_frame_objects)} tracked objects")
+                logger.info(f"Pipeline Debug: Final object IDs: {list(last_frame_objects.keys())}")
+            
+            # Save final masks for next segment
+            mask_path = os.path.join(segment_info['directory'], "mask.png")
+            sam2_processor.save_final_masks(
+                video_segments, 
+                mask_path,
+                green_color=config.get_green_color(),
+                blue_color=config.get_blue_color()
+            )
+            
+            # Apply green screen and save output video
+            success = mask_processor.process_segment(
+                segment_info,
+                video_segments,
+                use_nvenc=config.get_use_nvenc(),
+                bitrate=config.get_output_bitrate()
+            )
+            
+            if success:
+                logger.info(f"Successfully processed segment {segment_idx}")
+            else:
+                logger.error(f"Failed to create green screen video for segment {segment_idx}")
+        
+        # Log processing summary
+        logger.info(f"Sequential processing complete. Total humans detected: {total_humans_detected}")
+        
+        # Step 3: Assemble final video  
+        logger.info("Step 3: Assembling final video with audio")
+        
+        # Initialize video assembler
+        assembler = VideoAssembler(
+            preserve_audio=config.get_preserve_audio(),
+            use_nvenc=config.get_use_nvenc()
+        )
+        
+        # Verify all segments are complete
+        all_complete, missing = assembler.verify_segment_completeness(segments_dir)
+        
+        if not all_complete:
+            logger.error(f"Cannot assemble video - missing segments: {missing}")
+            return 1
+        
+        # Assemble final video
+        final_output = os.path.join(output_dir, config.get_output_filename())
+        
+        success = assembler.assemble_final_video(
+            segments_dir,
+            input_video,
+            final_output,
+            bitrate=config.get_output_bitrate()
+        )
+        
+        if success:
+            logger.info(f"Final video saved to: {final_output}")
        
        logger.info("Pipeline completed successfully")
        return 0