stereo mask working

2025-07-31 11:13:31 -07:00
parent 0057017ac4
commit b97a3752a7
8 changed files with 1247 additions and 206 deletions
--- a/core/yolo_detector.py
+++ b/core/yolo_detector.py
@@ -61,26 +61,36 @@ class YOLODetector:
            logger.error(f"Failed to load YOLO model: {e}")
            raise
    
-    def detect_humans_in_frame(self, frame: np.ndarray) -> List[Dict[str, Any]]:
+    def detect_humans_in_frame(self, frame: np.ndarray, confidence_override: Optional[float] = None, 
+                              validate_with_detection: bool = False) -> List[Dict[str, Any]]:
        """
        Detect humans in a single frame using YOLO.
        
        Args:
            frame: Input frame (BGR format from OpenCV)
+            confidence_override: Optional confidence to use instead of the default
+            validate_with_detection: If True and in segmentation mode, validate masks against detection bboxes
            
        Returns:
            List of human detection dictionaries with bbox, confidence, and optionally masks
        """
        # Run YOLO detection/segmentation
-        results = self.model(frame, conf=self.confidence_threshold, verbose=False)
+        confidence = confidence_override if confidence_override is not None else self.confidence_threshold
+        results = self.model(frame, conf=confidence, verbose=False)
        
        human_detections = []
        
        # Process results
-        for result in results:
+        for result_idx, result in enumerate(results):
            boxes = result.boxes
            masks = result.masks if hasattr(result, 'masks') and result.masks is not None else None
            
+            logger.debug(f"YOLO Result {result_idx}: boxes={boxes is not None}, masks={masks is not None}")
+            if boxes is not None:
+                logger.debug(f"  Found {len(boxes)} total boxes")
+            if masks is not None:
+                logger.debug(f"  Found {len(masks.data)} total masks")
+            
            if boxes is not None:
                for i, box in enumerate(boxes):
                    # Get class ID
@@ -101,18 +111,30 @@ class YOLODetector:
                        
                        # Extract mask if available (segmentation mode)
                        if masks is not None and i < len(masks.data):
-                            mask_data = masks.data[i].cpu().numpy()  # Get mask for this detection
+                            # Resize the raw mask to match the input frame dimensions
+                            raw_mask = masks.data[i].cpu().numpy()
+                            resized_mask = cv2.resize(raw_mask, (frame.shape[1], frame.shape[0]), interpolation=cv2.INTER_NEAREST)
+                            
+                            mask_area = np.sum(resized_mask > 0.5)
                            detection['has_mask'] = True
-                            detection['mask'] = mask_data
-                            logger.debug(f"YOLO Segmentation: Detected human with mask - conf={conf:.2f}, mask_shape={mask_data.shape}")
+                            detection['mask'] = resized_mask
+                            logger.info(f"YOLO Segmentation: Human {len(human_detections)} - conf={conf:.3f}, raw_mask_shape={raw_mask.shape}, frame_shape={frame.shape}, resized_mask_shape={resized_mask.shape}, mask_area={mask_area}px")
                        else:
-                            logger.debug(f"YOLO Detection: Detected human with bbox - conf={conf:.2f}, bbox={coords}")
+                            logger.debug(f"YOLO Detection: Human {len(human_detections)} - conf={conf:.3f}, bbox={coords} (no mask)")
                        
                        human_detections.append(detection)
+                    else:
+                        logger.debug(f"YOLO: Skipping non-human detection (class {cls})")
        
        if self.supports_segmentation:
            masks_found = sum(1 for d in human_detections if d['has_mask'])
            logger.info(f"YOLO Segmentation: Found {len(human_detections)} humans, {masks_found} with masks")
+            
+            # Optional validation with detection model
+            if validate_with_detection and masks_found > 0:
+                logger.info("Validating segmentation masks with detection model...")
+                validated_detections = self._validate_masks_with_detection(frame, human_detections, confidence_override)
+                return validated_detections
        else:
            logger.debug(f"YOLO Detection: Found {len(human_detections)} humans with bounding boxes")
        
@@ -1028,4 +1050,508 @@ class YOLODetector:
            
        except Exception as e:
            logger.error(f"Error creating {eye_side} eye debug frame: {e}")
-            return False
+            return False
+
+    def _calculate_iou(self, mask1: np.ndarray, mask2: np.ndarray) -> float:
+        """
+        Return the Intersection over Union (IoU) of two masks.
+
+        Any non-zero element counts as foreground. Masks whose shapes differ,
+        or whose union is empty, score 0.0.
+        """
+        if mask1.shape == mask2.shape:
+            overlap_px = np.logical_and(mask1, mask2).sum()
+            combined_px = np.logical_or(mask1, mask2).sum()
+            if combined_px > 0:
+                return overlap_px / combined_px
+        return 0.0
+    
+    def _calculate_stereo_similarity(self, left_mask: np.ndarray, right_mask: np.ndarray,
+                                   left_bbox: np.ndarray, right_bbox: np.ndarray,
+                                   left_idx: int = -1, right_idx: int = -1) -> float:
+        """
+        Score how similar a left-eye mask and a right-eye mask are for VR180 pairing.
+
+        The two eye views barely overlap, so instead of IoU the score is a
+        weighted sum of four terms: mask-area ratio (0.3), vertical-centre
+        alignment (0.4), bbox-height ratio (0.2) and bbox aspect-ratio
+        agreement (0.1).
+
+        Args:
+            left_mask: Mask from the left-eye view. It is summed directly, so it
+                is treated as a 0/1 array.
+            right_mask: Mask from the right-eye view, same convention.
+            left_bbox: Left bounding box, indexed as [x1, y1, x2, y2].
+            right_bbox: Right bounding box, indexed as [x1, y1, x2, y2].
+            left_idx: Index of the left mask; used only in log messages.
+            right_idx: Index of the right mask; used only in log messages.
+
+        Returns:
+            The weighted similarity score, or 0.0 when either mask is empty or
+            the mask shapes still differ after the resize attempt.
+        """
+        logger.info(f"  Starting similarity calculation L{left_idx} vs R{right_idx}")
+        logger.info(f"    Left mask: shape={left_mask.shape}, dtype={left_mask.dtype}, min={left_mask.min()}, max={left_mask.max()}")
+        logger.info(f"    Right mask: shape={right_mask.shape}, dtype={right_mask.dtype}, min={right_mask.min()}, max={right_mask.max()}")
+        logger.info(f"    Left bbox: {left_bbox}")
+        logger.info(f"    Right bbox: {right_bbox}")
+        if left_mask.shape != right_mask.shape:
+            logger.info(f"  L{left_idx} vs R{right_idx}: Shape mismatch - {left_mask.shape} vs {right_mask.shape} - attempting to resize")
+
+            # Resize whichever mask has fewer elements onto the other's grid,
+            # then re-binarize it.
+            if left_mask.size < right_mask.size:
+                left_mask = cv2.resize(left_mask.astype(np.float32), (right_mask.shape[1], right_mask.shape[0]), interpolation=cv2.INTER_NEAREST)
+                left_mask = left_mask > 0.5
+                logger.info(f"  Resized left mask to {left_mask.shape}")
+            else:
+                right_mask = cv2.resize(right_mask.astype(np.float32), (left_mask.shape[1], left_mask.shape[0]), interpolation=cv2.INTER_NEAREST)
+                right_mask = right_mask > 0.5
+                logger.info(f"  Resized right mask to {right_mask.shape}")
+
+            if left_mask.shape != right_mask.shape:
+                logger.warning(f"  L{left_idx} vs R{right_idx}: Still shape mismatch after resize - {left_mask.shape} vs {right_mask.shape}")
+                return 0.0
+
+        # 1. Size similarity (area ratio)
+        left_area = np.sum(left_mask)
+        right_area = np.sum(right_mask)
+
+        if left_area == 0 or right_area == 0:
+            logger.debug(f"  L{left_idx} vs R{right_idx}: Zero area - left={left_area}, right={right_area}")
+            return 0.0
+
+        area_ratio = min(left_area, right_area) / max(left_area, right_area)
+
+        # 2. Vertical position similarity (y-coordinates should be similar)
+        left_center_y = (left_bbox[1] + left_bbox[3]) / 2
+        right_center_y = (right_bbox[1] + right_bbox[3]) / 2
+
+        # Normalise the offset by the mask height; the score reaches zero once
+        # the centres are half a frame apart.
+        height = left_mask.shape[0]
+        y_diff = abs(left_center_y - right_center_y) / height
+        y_similarity = max(0, 1.0 - y_diff * 2)
+
+        # 3. Height similarity (bounding box heights should be similar)
+        left_height = left_bbox[3] - left_bbox[1]
+        right_height = right_bbox[3] - right_bbox[1]
+
+        if left_height == 0 or right_height == 0:
+            height_ratio = 0.0
+        else:
+            height_ratio = min(left_height, right_height) / max(left_height, right_height)
+
+        # 4. Aspect ratio similarity
+        left_width = left_bbox[2] - left_bbox[0]
+        right_width = right_bbox[2] - right_bbox[0]
+
+        # Defaults keep the summary log below well-defined for degenerate
+        # (zero-width or zero-height) boxes, where no aspect ratio exists.
+        left_aspect = 0.0
+        right_aspect = 0.0
+        if left_width == 0 or right_width == 0 or left_height == 0 or right_height == 0:
+            aspect_similarity = 0.0
+        else:
+            left_aspect = left_width / left_height
+            right_aspect = right_width / right_height
+            aspect_diff = abs(left_aspect - right_aspect) / max(left_aspect, right_aspect)
+            aspect_similarity = max(0, 1.0 - aspect_diff)
+
+        # Combine metrics with weights
+        similarity = (
+            area_ratio * 0.3 +           # 30% weight on size similarity
+            y_similarity * 0.4 +         # 40% weight on vertical alignment
+            height_ratio * 0.2 +         # 20% weight on height similarity
+            aspect_similarity * 0.1      # 10% weight on aspect ratio
+        )
+
+        # Detailed logging for each comparison
+        logger.info(f"  L{left_idx} vs R{right_idx}: area_ratio={area_ratio:.3f} (L={left_area}px, R={right_area}px), "
+                   f"y_sim={y_similarity:.3f} (L_y={left_center_y:.1f}, R_y={right_center_y:.1f}, diff={y_diff:.3f}), "
+                   f"height_ratio={height_ratio:.3f} (L_h={left_height:.1f}, R_h={right_height:.1f}), "
+                   f"aspect_sim={aspect_similarity:.3f} (L_asp={left_aspect:.2f}, R_asp={right_aspect:.2f}), "
+                   f"FINAL_SIMILARITY={similarity:.3f}")
+
+        return similarity
+
+    def _find_matching_mask_pairs(self, left_masks: List[Dict[str, Any]], right_masks: List[Dict[str, Any]],
+                                  similarity_threshold: float) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
+        """
+        Pair left-eye and right-eye detections by stereo similarity.
+
+        Every left/right combination is scored. Combinations scoring above
+        ``similarity_threshold`` are then claimed greedily from best to worst,
+        so each detection ends up in at most one pair.
+
+        Returns:
+            (matched_pairs, unmatched_left, unmatched_right). Each matched pair
+            is a dict with 'left_mask', 'right_mask' and 'similarity'.
+        """
+        logger.info(f"Starting stereo mask matching with {len(left_masks)} left masks and {len(right_masks)} right masks.")
+
+        # Nothing can be paired unless both eyes produced detections.
+        if not (left_masks and right_masks):
+            return [], left_masks, right_masks
+
+        possible_pairs = []
+        logger.info("--- Calculating all possible stereo similarity pairs ---")
+
+        # Dump a one-line summary of every candidate before scoring.
+        logger.info(f"LEFT EYE MASKS ({len(left_masks)} total):")
+        for i, det in enumerate(left_masks):
+            bbox, conf = det['bbox'], det['confidence']
+            mask_area = np.sum(det['mask'])
+            logger.info(f"  L{i}: bbox=[{bbox[0]:.1f},{bbox[1]:.1f},{bbox[2]:.1f},{bbox[3]:.1f}], area={mask_area}px, conf={conf:.3f}")
+
+        logger.info(f"RIGHT EYE MASKS ({len(right_masks)} total):")
+        for j, det in enumerate(right_masks):
+            bbox, conf = det['bbox'], det['confidence']
+            mask_area = np.sum(det['mask'])
+            logger.info(f"  R{j}: bbox=[{bbox[0]:.1f},{bbox[1]:.1f},{bbox[2]:.1f},{bbox[3]:.1f}], area={mask_area}px, conf={conf:.3f}")
+
+        # Score the full left x right grid; a failing comparison is logged and
+        # simply contributes no candidate.
+        logger.info("--- Stereo Similarity Calculations ---")
+        for i, left_det in enumerate(left_masks):
+            for j, right_det in enumerate(right_masks):
+                try:
+                    similarity = self._calculate_stereo_similarity(
+                        left_det['mask'], right_det['mask'],
+                        left_det['bbox'], right_det['bbox'],
+                        left_idx=i, right_idx=j
+                    )
+
+                    if similarity > similarity_threshold:
+                        possible_pairs.append({'left_idx': i, 'right_idx': j, 'similarity': similarity})
+                        logger.info(f"  ✓ L{i} vs R{j}: ABOVE THRESHOLD ({similarity:.4f} > {similarity_threshold:.4f})")
+                    else:
+                        logger.info(f"  ✗ L{i} vs R{j}: BELOW THRESHOLD ({similarity:.4f} <= {similarity_threshold:.4f})")
+                except Exception as e:
+                    logger.error(f"  ERROR L{i} vs R{j}: Exception in similarity calculation: {e}")
+
+        # Best candidates first, so the greedy pass below claims them first.
+        ranked_pairs = sorted(possible_pairs, key=lambda x: x['similarity'], reverse=True)
+
+        logger.debug("--- Sorted similarity pairs above threshold ---")
+        for pair in ranked_pairs:
+            logger.debug(f"  Pair (L{pair['left_idx']}, R{pair['right_idx']}) - Similarity: {pair['similarity']:.4f}")
+
+        matched_pairs = []
+        matched_left_indices = set()
+        matched_right_indices = set()
+
+        logger.debug("--- Selecting best pairs ---")
+        for pair in ranked_pairs:
+            left_idx, right_idx = pair['left_idx'], pair['right_idx']
+
+            # A detection may belong to one pair only.
+            if left_idx in matched_left_indices or right_idx in matched_right_indices:
+                logger.debug(f"  Skipping pair (L{left_idx}, R{right_idx}) because one mask is already matched.")
+                continue
+
+            logger.info(f"  MATCH FOUND: (L{left_idx}, R{right_idx}) with Similarity {pair['similarity']:.4f}")
+            matched_pairs.append({
+                'left_mask': left_masks[left_idx],
+                'right_mask': right_masks[right_idx],
+                'similarity': pair['similarity']
+            })
+            matched_left_indices.add(left_idx)
+            matched_right_indices.add(right_idx)
+
+        # Whatever was never claimed is an orphan.
+        unmatched_left = [det for pos, det in enumerate(left_masks) if pos not in matched_left_indices]
+        unmatched_right = [det for pos, det in enumerate(right_masks) if pos not in matched_right_indices]
+
+        logger.info(f"Matching complete: Found {len(matched_pairs)} pairs. Left orphans: {len(unmatched_left)}, Right orphans: {len(unmatched_right)}.")
+
+        return matched_pairs, unmatched_left, unmatched_right
+
+    def _save_stereo_agreement_debug_frame(self, left_frame: np.ndarray, right_frame: np.ndarray,
+                                           left_detections: List[Dict[str, Any]], right_detections: List[Dict[str, Any]],
+                                           matched_pairs: List[Dict[str, Any]], unmatched_left: List[Dict[str, Any]],
+                                           unmatched_right: List[Dict[str, Any]], output_path: str, title: str):
+        """
+        Write a side-by-side debug image showing which stereo masks were paired.
+
+        Every detection is first outlined in red with an L<i>/R<i> label (masks
+        under 100 px are skipped). Matched pairs are then redrawn in green and
+        joined by a line annotated with their similarity score.
+
+        Args:
+            left_frame: Left-eye image; placed on the left of the output.
+            right_frame: Right-eye image; placed on the right of the output.
+            left_detections: Left detection dicts with 'mask', 'bbox', 'confidence'.
+            right_detections: Right detection dicts with the same keys.
+            matched_pairs: Dicts with 'left_mask', 'right_mask' and 'similarity'
+                (a legacy 'iou' key is accepted as a fallback).
+            unmatched_left: Accepted for interface compatibility; not drawn separately.
+            unmatched_right: Accepted for interface compatibility; not drawn separately.
+            output_path: Destination passed to cv2.imwrite.
+            title: Text drawn in the top-left corner.
+
+        Any exception raised while drawing or saving is logged and swallowed.
+        """
+        red, green = (0, 0, 255), (0, 255, 0)
+        try:
+            # Right-eye drawings are shifted by the left frame's width.
+            w = left_frame.shape[1]
+            combined_frame = np.hstack((left_frame, right_frame))
+
+            def get_centroid(mask):
+                m = cv2.moments(mask.astype(np.uint8), binaryImage=True)
+                return (int(m["m10"] / m["m00"]), int(m["m01"] / m["m00"])) if m["m00"] != 0 else (0,0)
+
+            def draw_label(frame, text, pos, color):
+                # Draw a black background rectangle
+                cv2.rectangle(frame, (pos[0], pos[1] - 14), (pos[0] + len(text) * 8, pos[1] + 5), (0,0,0), -1)
+                # Draw the text
+                cv2.putText(frame, text, pos, cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
+
+            def outline_mask(mask, x_shift, color, thickness, text):
+                """Draw the mask's contours and label; return the label anchor, or None if nothing valid was labelled."""
+                contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                if not contours:
+                    return None
+                for cnt in contours:
+                    cnt[:, :, 0] += x_shift
+                cv2.drawContours(combined_frame, contours, -1, color, thickness)
+                cx, cy = get_centroid(mask)
+                # (0, 0) is get_centroid's "no centroid" value; treat edge hits as invalid too.
+                if cx <= 0 or cy <= 0:
+                    return None
+                anchor = (cx + x_shift, cy)
+                draw_label(combined_frame, text, anchor, color)
+                return anchor
+
+            def find_index(detections, target):
+                """Locate `target` in `detections` by bbox and confidence; None if absent."""
+                for idx, det in enumerate(detections):
+                    if (np.array_equal(det['bbox'], target['bbox']) and
+                            abs(det['confidence'] - target['confidence']) < 0.001):
+                        return idx
+                return None
+
+            # --- Draw ALL masks first, in red, so every mask gets a label ---
+            logger.info(f"Debug Frame: Drawing {len(left_detections)} left masks and {len(right_detections)} right masks")
+
+            for side, tag, detections, x_shift in (("left", "L", left_detections, 0),
+                                                   ("right", "R", right_detections, w)):
+                for i, detection in enumerate(detections):
+                    mask = detection['mask']
+                    mask_area = np.sum(mask > 0.5)
+
+                    # Skip tiny masks that are likely noise
+                    if mask_area < 100:
+                        logger.debug(f"Skipping tiny {side} mask {tag}{i} with area {mask_area}px")
+                        continue
+
+                    c = outline_mask(mask, x_shift, red, 2, f"{tag}{i}")
+                    if c is not None:
+                        logger.debug(f"Drew {side} mask {tag}{i} at centroid {c}, area={mask_area}px")
+
+            # --- Now overdraw matched pairs in green (thicker outline) ---
+            for pair in matched_pairs:
+                left_idx = find_index(left_detections, pair['left_mask'])
+                right_idx = find_index(right_detections, pair['right_mask'])
+
+                c1 = outline_mask(pair['left_mask']['mask'], 0, green, 3,
+                                  f"L{left_idx if left_idx is not None else '?'}")
+                c2 = outline_mask(pair['right_mask']['mask'], w, green, 3,
+                                  f"R{right_idx if right_idx is not None else '?'}")
+
+                # Connect the pair only when BOTH centroids are valid; otherwise
+                # c1 could be missing or left over from a previous pair.
+                if c1 is not None and c2 is not None:
+                    cv2.line(combined_frame, c1, c2, green, 2)
+                    similarity_text = f"Sim: {pair.get('similarity', pair.get('iou', 0)):.2f}"
+                    cv2.putText(combined_frame, similarity_text, (c1[0] + 10, c1[1] + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
+
+            # Add title
+            cv2.putText(combined_frame, title, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)
+
+            cv2.imwrite(output_path, combined_frame)
+            logger.info(f"Saved stereo agreement debug frame to {output_path}")
+        except Exception as e:
+            logger.error(f"Failed to create stereo agreement debug frame: {e}")
+
+    def detect_and_match_stereo_pairs(self, frame: np.ndarray, confidence_reduction_factor: float,
+                                      stereo_similarity_threshold: float, segment_info: dict, save_debug_frames: bool) -> List[Dict[str, Any]]:
+        """
+        Detect people in both halves of a side-by-side frame and pair them up.
+
+        The frame is split down the middle, each half goes through
+        detect_humans_in_frame (with validation enabled), and the results are
+        paired by _find_matching_mask_pairs. If nothing pairs, detection is
+        retried once with the confidence threshold multiplied by
+        ``confidence_reduction_factor``.
+
+        Args:
+            frame: Full frame containing both eye views side by side.
+            confidence_reduction_factor: Multiplier applied to the confidence
+                threshold for the retry.
+            stereo_similarity_threshold: Scaled by 0.6 and floored at 0.2 to get
+                the similarity cut-off used for matching.
+            segment_info: Its 'directory' entry is where debug frames are written.
+            save_debug_frames: Whether to write a debug image per attempt.
+
+        Returns:
+            Two prompt dicts per matched pair ('obj_id', 'bbox', 'mask') in
+            full-frame coordinates, with obj_id 2*i+1 for the left eye and
+            2*i+2 for the right. Empty when no pair was found.
+        """
+        frame_height, frame_width, _ = frame.shape
+        half_width = frame_width // 2
+
+        # Both halves get exactly half_width columns (an odd trailing column is dropped).
+        left_eye_frame, right_eye_frame = frame[:, :half_width], frame[:, half_width:half_width*2]
+
+        logger.info(f"VR180 Frame Split: Original={frame.shape}, Left={left_eye_frame.shape}, Right={right_eye_frame.shape}")
+
+        # The caller's threshold is IoU-flavoured; map it onto the similarity scale.
+        similarity_threshold = max(0.2, stereo_similarity_threshold * 0.6)
+
+        # --- First attempt: default confidence ---
+        logger.info(f"Running initial stereo detection at {self.confidence_threshold} confidence.")
+        left_detections = self.detect_humans_in_frame(left_eye_frame, validate_with_detection=True)
+        right_detections = self.detect_humans_in_frame(right_eye_frame, validate_with_detection=True)
+
+        matched_pairs, unmatched_left, unmatched_right = self._find_matching_mask_pairs(left_detections, right_detections, similarity_threshold)
+
+        if save_debug_frames:
+            debug_path = os.path.join(segment_info['directory'], "yolo_stereo_agreement_initial.jpg")
+            title = f"Initial Attempt (Conf: {self.confidence_threshold:.2f}) - {len(matched_pairs)} Pairs"
+            self._save_stereo_agreement_debug_frame(left_eye_frame, right_eye_frame, left_detections, right_detections, matched_pairs, unmatched_left, unmatched_right, debug_path, title)
+
+        # --- Second attempt: only when the first produced no pair ---
+        if not matched_pairs:
+            new_confidence = self.confidence_threshold * confidence_reduction_factor
+            logger.info(f"No valid pairs found. Reducing confidence to {new_confidence:.2f} and retrying.")
+
+            left_detections = self.detect_humans_in_frame(left_eye_frame, confidence_override=new_confidence, validate_with_detection=True)
+            right_detections = self.detect_humans_in_frame(right_eye_frame, confidence_override=new_confidence, validate_with_detection=True)
+
+            matched_pairs, unmatched_left, unmatched_right = self._find_matching_mask_pairs(left_detections, right_detections, similarity_threshold)
+
+            if save_debug_frames:
+                debug_path = os.path.join(segment_info['directory'], "yolo_stereo_agreement_retry.jpg")
+                title = f"Retry Attempt (Conf: {new_confidence:.2f}) - {len(matched_pairs)} Pairs"
+                self._save_stereo_agreement_debug_frame(left_eye_frame, right_eye_frame, left_detections, right_detections, matched_pairs, unmatched_left, unmatched_right, debug_path, title)
+
+        if not matched_pairs:
+            logger.warning("No valid stereo pairs found after all attempts.")
+            return []
+
+        logger.info(f"Found {len(matched_pairs)} valid stereo pairs.")
+
+        # Lift each pair from per-eye coordinates into the full frame.
+        final_prompts = []
+        for i, pair in enumerate(matched_pairs):
+            left_det, right_det = pair['left_mask'], pair['right_mask']
+
+            left_bbox_full_frame, left_mask_full_frame = self._convert_eye_to_full_frame(
+                left_det['bbox'], left_det['mask'], 'left', frame_width, frame_height
+            )
+            right_bbox_full_frame, right_mask_full_frame = self._convert_eye_to_full_frame(
+                right_det['bbox'], right_det['mask'], 'right', frame_width, frame_height
+            )
+
+            logger.info(f"Stereo Pair {i}: Left bbox {pair['left_mask']['bbox']} -> {left_bbox_full_frame}")
+            logger.info(f"Stereo Pair {i}: Right bbox {pair['right_mask']['bbox']} -> {right_bbox_full_frame}")
+
+            # Consecutive object ids: odd for the left eye, even for the right.
+            final_prompts.extend([
+                {'obj_id': i * 2 + 1, 'bbox': left_bbox_full_frame, 'mask': left_mask_full_frame},
+                {'obj_id': i * 2 + 2, 'bbox': right_bbox_full_frame, 'mask': right_mask_full_frame},
+            ])
+
+        return final_prompts
+    
+    def _convert_eye_to_full_frame(self, eye_bbox: np.ndarray, eye_mask: np.ndarray,
+                                 eye_side: str, full_frame_width: int, full_frame_height: int) -> tuple:
+        """
+        Map a per-eye bounding box and mask into full-frame coordinates.
+
+        Args:
+            eye_bbox: Bounding box in the eye's own coordinate system.
+            eye_mask: 2-D mask in the eye's own coordinate system.
+            eye_side: 'left' or 'right'.
+            full_frame_width: Width of the full VR180 frame.
+            full_frame_height: Height of the full VR180 frame.
+
+        Returns:
+            Tuple of (full_frame_bbox, full_frame_mask). The bbox is a copy
+            with x1/x2 shifted by half the frame width for the right eye. The
+            mask is a zero canvas of the full frame size with the eye mask
+            pasted into its half, cropped if it is larger than that half.
+        """
+        half_width = full_frame_width // 2
+
+        # Work on a copy so the caller's bbox is left untouched.
+        full_frame_bbox = eye_bbox.copy()
+        if eye_side == 'right':
+            for x_index in (0, 2):  # x1, x2
+                full_frame_bbox[x_index] += half_width
+
+        full_frame_mask = np.zeros((full_frame_height, full_frame_width), dtype=eye_mask.dtype)
+
+        # The left eye is pasted at column 0; anything else goes in the right half.
+        column_start = 0 if eye_side == 'left' else half_width
+        eye_height, eye_width = eye_mask.shape
+        rows = min(eye_height, full_frame_height)
+        cols = min(eye_width, half_width)
+        full_frame_mask[:rows, column_start:column_start + cols] = eye_mask[:rows, :cols]
+
+        logger.debug(f"Converted {eye_side} eye: bbox {eye_bbox} -> {full_frame_bbox}, "
+                    f"mask {eye_mask.shape} -> {full_frame_mask.shape}, "
+                    f"mask_pixels: {np.sum(eye_mask > 0.5)} -> {np.sum(full_frame_mask > 0.5)}")
+
+        return full_frame_bbox, full_frame_mask
+    
+    def _validate_masks_with_detection(self, frame: np.ndarray, segmentation_detections: List[Dict[str, Any]],
+                                     confidence_override: Optional[float] = None) -> List[Dict[str, Any]]:
+        """
+        Keep only segmentation results that a plain detection model agrees with.
+
+        A second YOLO model is loaded on first use and cached on the instance
+        as ``_detection_model``. A masked detection survives when its bbox
+        overlaps some human box from that model by more than 0.3 (measured
+        with _calculate_bbox_overlap). Detections without a mask pass through
+        unchanged. If the model cannot be loaded, the input list is returned
+        as-is.
+
+        Args:
+            frame: Image to run the detection model on.
+            segmentation_detections: Detection dicts with 'has_mask', 'mask', 'bbox'.
+            confidence_override: Confidence to use instead of self.confidence_threshold.
+
+        Returns:
+            The detections that passed validation.
+        """
+        if not hasattr(self, '_detection_model'):
+            # Lazily load the detection counterpart of the segmentation weights.
+            try:
+                detection_model_path = self.model_path.replace('-seg.pt', '.pt')
+                if not os.path.exists(detection_model_path):
+                    detection_model_path = "yolo11l.pt"  # Fallback to default
+
+                logger.info(f"Loading detection model for validation: {detection_model_path}")
+                self._detection_model = YOLO(detection_model_path)
+            except Exception as e:
+                logger.warning(f"Could not load detection model for validation: {e}")
+                return segmentation_detections
+
+        confidence = self.confidence_threshold if confidence_override is None else confidence_override
+        detection_results = self._detection_model(frame, conf=confidence, verbose=False)
+
+        # Gather every human box the detection model reports.
+        detection_bboxes = []
+        for result in detection_results:
+            if result.boxes is None:
+                continue
+            for box in result.boxes:
+                cls = int(box.cls.cpu().numpy()[0])
+                if cls != self.human_class_id:
+                    continue
+                coords = box.xyxy[0].cpu().numpy()
+                conf = float(box.conf.cpu().numpy()[0])
+                detection_bboxes.append({'bbox': coords, 'confidence': conf})
+
+        logger.info(f"Validation: Found {len(detection_bboxes)} detection bboxes vs {len(segmentation_detections)} segmentation masks")
+
+        validated_detections = []
+        for seg_det in segmentation_detections:
+            # Mask-less detections have nothing to validate.
+            if not seg_det['has_mask']:
+                validated_detections.append(seg_det)
+                continue
+
+            mask = seg_det['mask']
+            seg_bbox = seg_det['bbox']
+
+            # Track the detection box that overlaps this one the most.
+            best_overlap = 0.0
+            best_detection = None
+            for candidate in detection_bboxes:
+                overlap = self._calculate_bbox_overlap(seg_bbox, candidate['bbox'])
+                if overlap > best_overlap:
+                    best_overlap, best_detection = overlap, candidate
+
+            if best_overlap > 0.3:  # 30% overlap threshold
+                logger.info(f"Validation: Segmentation mask validated (overlap={best_overlap:.3f} with detection conf={best_detection['confidence']:.3f})")
+                validated_detections.append(seg_det)
+            else:
+                mask_area = np.sum(mask > 0.5)
+                logger.warning(f"Validation: Rejecting segmentation mask with low overlap ({best_overlap:.3f}) - area={mask_area}px")
+
+        logger.info(f"Validation: Kept {len(validated_detections)}/{len(segmentation_detections)} segmentation masks")
+        return validated_detections
+    
+    def _calculate_bbox_overlap(self, bbox1: np.ndarray, bbox2: np.ndarray) -> float:
+        """
+        Return how much of the smaller box is covered by the intersection.
+
+        Boxes are indexed as [x1, y1, x2, y2]. The result is the intersection
+        area divided by the smaller of the two box areas (more lenient than
+        IoU), or 0.0 when the boxes do not overlap.
+        """
+        left, right = max(bbox1[0], bbox2[0]), min(bbox1[2], bbox2[2])
+        top, bottom = max(bbox1[1], bbox2[1]), min(bbox1[3], bbox2[3])
+
+        # Touching or disjoint boxes share no area.
+        if right <= left or bottom <= top:
+            return 0.0
+
+        shared_area = (right - left) * (bottom - top)
+        smaller_area = min(
+            (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]),
+            (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]),
+        )
+
+        if smaller_area > 0:
+            return shared_area / smaller_area
+        return 0.0