stereo mask working

2025-07-31 11:13:31 -07:00
parent 0057017ac4
commit b97a3752a7
8 changed files with 1247 additions and 206 deletions
--- a/main.py
+++ b/main.py
@@ -681,138 +681,41 @@ async def main_async():
            previous_masks = None
            
            if use_detections:
-                # Run YOLO detection on current segment
-                logger.info(f"Running YOLO detection on segment {segment_idx}")
-                detection_file = os.path.join(segment_info['directory'], "yolo_detections")
+                # Run YOLO stereo detection and matching on current segment
+                logger.info(f"Running stereo pair detection on segment {segment_idx}")
                
-                # Check if detection already exists
-                if os.path.exists(detection_file):
-                    logger.info(f"Loading existing YOLO detections for segment {segment_idx}")
-                    detections = detector.load_detections_from_file(detection_file)
-                else:
-                    # Run YOLO detection on first frame
-                    detections = detector.detect_humans_in_video_first_frame(
-                        segment_info['video_file'], 
-                        scale=config.get_inference_scale()
-                    )
-                    # Save detections for future runs
-                    detector.save_detections_to_file(detections, detection_file)
-                
-                if detections:
-                    total_humans_detected += len(detections)
-                    logger.info(f"Found {len(detections)} humans in segment {segment_idx}")
-                    
-                    # Get frame width from video
-                    cap = cv2.VideoCapture(segment_info['video_file'])
-                    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-                    cap.release()
-                    
-                    yolo_prompts = detector.convert_detections_to_sam2_prompts(
-                        detections, frame_width
-                    )
-                    
-                    # If no right eye detections found, run debug analysis with lower confidence
-                    half_frame_width = frame_width // 2
-                    right_eye_detections = [d for d in detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width]
-                    
-                    if len(right_eye_detections) == 0 and config.get('advanced.save_yolo_debug_frames', False):
-                        logger.info(f"VR180 Debug: No right eye detections found, running lower confidence analysis...")
-                        
-                        # Load first frame for debug analysis
-                        cap = cv2.VideoCapture(segment_info['video_file'])
-                        ret, debug_frame = cap.read()
-                        cap.release()
-                        
-                        if ret:
-                            # Scale frame to match detection scale
-                            if config.get_inference_scale() != 1.0:
-                                scale = config.get_inference_scale()
-                                debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
-                            
-                            # Run debug detection with lower confidence
-                            debug_detections = detector.debug_detect_with_lower_confidence(debug_frame, debug_confidence=0.3)
-                            
-                            # Analyze where these lower confidence detections are
-                            debug_right_eye = [d for d in debug_detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width]
-                            
-                            if len(debug_right_eye) > 0:
-                                logger.warning(f"VR180 Debug: Found {len(debug_right_eye)} right eye detections with lower confidence!")
-                                for i, det in enumerate(debug_right_eye):
-                                    logger.warning(f"VR180 Debug: Right eye detection {i+1}: conf={det['confidence']:.3f}, bbox={det['bbox']}")
-                                logger.warning(f"VR180 Debug: Consider lowering yolo_confidence from {config.get_yolo_confidence()} to 0.3-0.4")
-                            else:
-                                logger.info(f"VR180 Debug: No right eye detections found even with confidence 0.3")
-                                logger.info(f"VR180 Debug: This confirms person is not visible in right eye view")
-                    
-                    logger.info(f"Pipeline Debug: Segment {segment_idx} - Generated {len(yolo_prompts)} SAM2 prompts from {len(detections)} YOLO detections")
-                    
-                    # Save debug frame with detections visualized (if enabled)
-                    if config.get('advanced.save_yolo_debug_frames', False):
-                        debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug.jpg")
-                        
-                        # Load first frame for debug visualization
-                        cap = cv2.VideoCapture(segment_info['video_file'])
-                        ret, debug_frame = cap.read()
-                        cap.release()
-                        
-                        if ret:
-                            # Scale frame to match detection scale
-                            if config.get_inference_scale() != 1.0:
-                                scale = config.get_inference_scale()
-                                debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
-                            
-                            detector.save_debug_frame_with_detections(debug_frame, detections, debug_frame_path, yolo_prompts)
+                # Load the first frame for detection
+                cap = cv2.VideoCapture(segment_info['video_file'])
+                ret, frame = cap.read()
+                cap.release()
+
+                if not ret:
+                    logger.error(f"Could not read first frame of segment {segment_idx}")
+                    continue
+
+                # Scale frame if needed
+                if config.get_inference_scale() != 1.0:
+                    frame = cv2.resize(frame, None, fx=config.get_inference_scale(), fy=config.get_inference_scale(), interpolation=cv2.INTER_LINEAR)
+
+                yolo_prompts = detector.detect_and_match_stereo_pairs(
+                    frame,
+                    config.get_confidence_reduction_factor(),
+                    config.get_stereo_iou_threshold(),
+                    segment_info,
+                    config.get('advanced.save_yolo_debug_frames', True)
+                )
+
+                if not yolo_prompts:
+                    logger.warning(f"No valid stereo pairs found for segment {segment_idx}. Attempting to use previous segment's mask.")
+                    if segment_idx > 0:
+                        prev_segment_dir = segments_info[segment_idx - 1]['directory']
+                        previous_masks = sam2_processor.load_previous_segment_mask(prev_segment_dir)
+                        if previous_masks:
+                            logger.info(f"Using masks from segment {segment_idx - 1} as fallback.")
                        else:
-                            logger.warning(f"Could not load frame for debug visualization in segment {segment_idx}")
-                        
-                        # Check if we have YOLO masks for debug visualization
-                        has_yolo_masks = False
-                        if detections and detector.supports_segmentation:
-                            has_yolo_masks = any(d.get('has_mask', False) for d in detections)
-                        
-                        # Generate first frame masks debug (SAM2 or YOLO)
-                        first_frame_debug_path = os.path.join(segment_info['directory'], "first_frame_detection.jpg")
-                        
-                        if has_yolo_masks:
-                            logger.info(f"Pipeline Debug: Generating YOLO first frame masks for segment {segment_idx}")
-                            # Create YOLO mask debug visualization
-                            create_yolo_mask_debug_frame(detections, segment_info['video_file'], first_frame_debug_path, config.get_inference_scale())
-                        else:
-                            logger.info(f"Pipeline Debug: Generating SAM2 first frame masks for segment {segment_idx}")
-                            sam2_processor.generate_first_frame_debug_masks(
-                                segment_info['video_file'], 
-                                yolo_prompts, 
-                                first_frame_debug_path,
-                                config.get_inference_scale()
-                            )
-                else:
-                    logger.warning(f"No humans detected in segment {segment_idx}")
-                    
-                    # Save debug frame even when no detections (if enabled)
-                    if config.get('advanced.save_yolo_debug_frames', False):
-                        debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug_no_detections.jpg")
-                        
-                        # Load first frame for debug visualization
-                        cap = cv2.VideoCapture(segment_info['video_file'])
-                        ret, debug_frame = cap.read()
-                        cap.release()
-                        
-                        if ret:
-                            # Scale frame to match detection scale
-                            if config.get_inference_scale() != 1.0:
-                                scale = config.get_inference_scale()
-                                debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
-                            
-                            # Add "No detections" text overlay
-                            cv2.putText(debug_frame, "YOLO: No humans detected", 
-                                      (10, 30), 
-                                      cv2.FONT_HERSHEY_SIMPLEX, 1.0, 
-                                      (0, 0, 255), 2)  # Red text
-                            
-                            cv2.imwrite(debug_frame_path, debug_frame)
-                            logger.info(f"Saved no-detection debug frame to {debug_frame_path}")
-                        else:
-                            logger.warning(f"Could not load frame for no-detection debug visualization in segment {segment_idx}")
+                            logger.error(f"Fallback failed: No previous mask found for segment {segment_idx}.")
+                    else:
+                        logger.error("Cannot use fallback for the first segment.")
            elif segment_idx > 0:
                # Try to load previous segment mask
                for j in range(segment_idx - 1, -1, -1):
@@ -826,43 +729,20 @@ async def main_async():
                logger.error(f"No prompts or previous masks available for segment {segment_idx}")
                continue
            
-            # Check if we have YOLO masks and can skip SAM2 (recheck in case detections were loaded from file)
-            if not 'has_yolo_masks' in locals():
-                has_yolo_masks = False
-                if detections and detector.supports_segmentation:
-                    has_yolo_masks = any(d.get('has_mask', False) for d in detections)
+            # Check if we have YOLO masks from the stereo pair matching and can use them as initial masks for SAM2
+            if yolo_prompts and detector.supports_segmentation:
+                logger.info(f"Pipeline Debug: YOLO segmentation provided matched stereo masks - using as SAM2 initial masks.")
                
-            if has_yolo_masks:
-                logger.info(f"Pipeline Debug: YOLO segmentation provided masks - using as SAM2 initial masks for segment {segment_idx}")
+                # Convert the prompts (which contain masks) into the initial_masks format for SAM2
+                initial_masks = {prompt['obj_id']: prompt['mask'] for prompt in yolo_prompts if 'mask' in prompt}
                
-                # Convert YOLO masks to initial masks for SAM2
-                cap = cv2.VideoCapture(segment_info['video_file'])
-                frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-                frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-                cap.release()
-                
-                # Convert YOLO masks to the format expected by SAM2 add_previous_masks_to_predictor
-                yolo_masks_dict = {}
-                for i, detection in enumerate(detections[:2]):  # Up to 2 objects
-                    if detection.get('has_mask', False):
-                        mask = detection['mask']
-                        # Resize mask to match inference scale
-                        if config.get_inference_scale() != 1.0:
-                            scale = config.get_inference_scale()
-                            scaled_height = int(frame_height * scale)
-                            scaled_width = int(frame_width * scale)
-                            mask = cv2.resize(mask.astype(np.float32), (scaled_width, scaled_height), interpolation=cv2.INTER_NEAREST)
-                            mask = mask > 0.5
-                        
-                        obj_id = i + 1  # Sequential object IDs
-                        yolo_masks_dict[obj_id] = mask.astype(bool)
-                        logger.info(f"Pipeline Debug: YOLO mask for Object {obj_id} - shape: {mask.shape}, pixels: {np.sum(mask)}")
-                
-                logger.info(f"Pipeline Debug: Using YOLO masks as SAM2 initial masks - {len(yolo_masks_dict)} objects")
-                
-                # Use traditional SAM2 pipeline with YOLO masks as initial masks
-                previous_masks = yolo_masks_dict
-                yolo_prompts = None  # Don't use bounding box prompts when we have masks
+                if initial_masks:
+                    # We are providing initial masks, so we should not provide bbox prompts
+                    previous_masks = initial_masks
+                    yolo_prompts = None
+                    logger.info(f"Pipeline Debug: Using {len(previous_masks)} YOLO masks as SAM2 initial masks.")
+                else:
+                    logger.warning("YOLO segmentation mode is on, but no masks were found in the final prompts.")
            
            # Debug what we're passing to SAM2
            if yolo_prompts: