stereo mask working

This commit is contained in:
2025-07-31 11:13:31 -07:00
parent 0057017ac4
commit b97a3752a7
8 changed files with 1247 additions and 206 deletions

View File

@@ -61,26 +61,36 @@ class YOLODetector:
logger.error(f"Failed to load YOLO model: {e}")
raise
def detect_humans_in_frame(self, frame: np.ndarray) -> List[Dict[str, Any]]:
def detect_humans_in_frame(self, frame: np.ndarray, confidence_override: Optional[float] = None,
validate_with_detection: bool = False) -> List[Dict[str, Any]]:
"""
Detect humans in a single frame using YOLO.
Args:
frame: Input frame (BGR format from OpenCV)
confidence_override: Optional confidence to use instead of the default
validate_with_detection: If True and in segmentation mode, validate masks against detection bboxes
Returns:
List of human detection dictionaries with bbox, confidence, and optionally masks
"""
# Run YOLO detection/segmentation
results = self.model(frame, conf=self.confidence_threshold, verbose=False)
confidence = confidence_override if confidence_override is not None else self.confidence_threshold
results = self.model(frame, conf=confidence, verbose=False)
human_detections = []
# Process results
for result in results:
for result_idx, result in enumerate(results):
boxes = result.boxes
masks = result.masks if hasattr(result, 'masks') and result.masks is not None else None
logger.debug(f"YOLO Result {result_idx}: boxes={boxes is not None}, masks={masks is not None}")
if boxes is not None:
logger.debug(f" Found {len(boxes)} total boxes")
if masks is not None:
logger.debug(f" Found {len(masks.data)} total masks")
if boxes is not None:
for i, box in enumerate(boxes):
# Get class ID
@@ -101,18 +111,30 @@ class YOLODetector:
# Extract mask if available (segmentation mode)
if masks is not None and i < len(masks.data):
mask_data = masks.data[i].cpu().numpy() # Get mask for this detection
# Resize the raw mask to match the input frame dimensions
raw_mask = masks.data[i].cpu().numpy()
resized_mask = cv2.resize(raw_mask, (frame.shape[1], frame.shape[0]), interpolation=cv2.INTER_NEAREST)
mask_area = np.sum(resized_mask > 0.5)
detection['has_mask'] = True
detection['mask'] = mask_data
logger.debug(f"YOLO Segmentation: Detected human with mask - conf={conf:.2f}, mask_shape={mask_data.shape}")
detection['mask'] = resized_mask
logger.info(f"YOLO Segmentation: Human {len(human_detections)} - conf={conf:.3f}, raw_mask_shape={raw_mask.shape}, frame_shape={frame.shape}, resized_mask_shape={resized_mask.shape}, mask_area={mask_area}px")
else:
logger.debug(f"YOLO Detection: Detected human with bbox - conf={conf:.2f}, bbox={coords}")
logger.debug(f"YOLO Detection: Human {len(human_detections)} - conf={conf:.3f}, bbox={coords} (no mask)")
human_detections.append(detection)
else:
logger.debug(f"YOLO: Skipping non-human detection (class {cls})")
if self.supports_segmentation:
masks_found = sum(1 for d in human_detections if d['has_mask'])
logger.info(f"YOLO Segmentation: Found {len(human_detections)} humans, {masks_found} with masks")
# Optional validation with detection model
if validate_with_detection and masks_found > 0:
logger.info("Validating segmentation masks with detection model...")
validated_detections = self._validate_masks_with_detection(frame, human_detections, confidence_override)
return validated_detections
else:
logger.debug(f"YOLO Detection: Found {len(human_detections)} humans with bounding boxes")
@@ -1028,4 +1050,508 @@ class YOLODetector:
except Exception as e:
logger.error(f"Error creating {eye_side} eye debug frame: {e}")
return False
return False
def _calculate_iou(self, mask1: np.ndarray, mask2: np.ndarray) -> float:
"""Calculate Intersection over Union for two masks of the same size."""
if mask1.shape != mask2.shape:
return 0.0
intersection = np.logical_and(mask1, mask2).sum()
union = np.logical_or(mask1, mask2).sum()
return intersection / union if union > 0 else 0.0
def _calculate_stereo_similarity(self, left_mask: np.ndarray, right_mask: np.ndarray,
                                 left_bbox: np.ndarray, right_bbox: np.ndarray,
                                 left_idx: int = -1, right_idx: int = -1) -> float:
    """
    Calculate stereo similarity for VR180 masks using spatial and size features.

    For VR180, left and right eye views won't overlap much, so instead of
    IoU the score is a weighted sum of four terms, each in [0, 1]:

        * 30% mask area ratio (smaller / larger)
        * 40% vertical alignment of the bbox centres, relative to mask height
        * 20% bbox height ratio (smaller / larger)
        * 10% bbox aspect-ratio agreement

    Args:
        left_mask: 2D mask from the left eye.
        right_mask: 2D mask from the right eye. If the shapes differ, the
            smaller mask is resized to the larger one and binarised at 0.5.
        left_bbox: Left bounding box, indexed as [x1, y1, x2, y2].
        right_bbox: Right bounding box, indexed as [x1, y1, x2, y2].
        left_idx: Index of the left detection, used only in log messages.
        right_idx: Index of the right detection, used only in log messages.

    Returns:
        The combined similarity, or 0.0 when either mask is empty or the
        shapes still differ after resizing.
    """
    logger.info(f" Starting similarity calculation L{left_idx} vs R{right_idx}")
    logger.info(f" Left mask: shape={left_mask.shape}, dtype={left_mask.dtype}, min={left_mask.min()}, max={left_mask.max()}")
    logger.info(f" Right mask: shape={right_mask.shape}, dtype={right_mask.dtype}, min={right_mask.min()}, max={right_mask.max()}")
    logger.info(f" Left bbox: {left_bbox}")
    logger.info(f" Right bbox: {right_bbox}")

    if left_mask.shape != right_mask.shape:
        logger.info(f" L{left_idx} vs R{right_idx}: Shape mismatch - {left_mask.shape} vs {right_mask.shape} - attempting to resize")
        # Try to resize the smaller mask to match the larger one
        if left_mask.size < right_mask.size:
            left_mask = cv2.resize(left_mask.astype(np.float32), (right_mask.shape[1], right_mask.shape[0]), interpolation=cv2.INTER_NEAREST)
            left_mask = left_mask > 0.5
            logger.info(f" Resized left mask to {left_mask.shape}")
        else:
            right_mask = cv2.resize(right_mask.astype(np.float32), (left_mask.shape[1], left_mask.shape[0]), interpolation=cv2.INTER_NEAREST)
            right_mask = right_mask > 0.5
            logger.info(f" Resized right mask to {right_mask.shape}")

    if left_mask.shape != right_mask.shape:
        logger.warning(f" L{left_idx} vs R{right_idx}: Still shape mismatch after resize - {left_mask.shape} vs {right_mask.shape}")
        return 0.0

    # 1. Size similarity (area ratio)
    left_area = np.sum(left_mask)
    right_area = np.sum(right_mask)
    if left_area == 0 or right_area == 0:
        logger.debug(f" L{left_idx} vs R{right_idx}: Zero area - left={left_area}, right={right_area}")
        return 0.0
    area_ratio = min(left_area, right_area) / max(left_area, right_area)

    # 2. Vertical position similarity (y-coordinates should be similar)
    left_center_y = (left_bbox[1] + left_bbox[3]) / 2
    right_center_y = (right_bbox[1] + right_bbox[3]) / 2
    height = left_mask.shape[0]
    y_diff = abs(left_center_y - right_center_y) / height
    y_similarity = max(0, 1.0 - y_diff * 2)  # Penalize vertical misalignment

    # 3. Height similarity (bounding box heights should be similar)
    left_height = left_bbox[3] - left_bbox[1]
    right_height = right_bbox[3] - right_bbox[1]
    if left_height == 0 or right_height == 0:
        height_ratio = 0.0
    else:
        height_ratio = min(left_height, right_height) / max(left_height, right_height)

    # 4. Aspect ratio similarity
    left_width = left_bbox[2] - left_bbox[0]
    right_width = right_bbox[2] - right_bbox[0]
    # Defaults keep the summary log below well-defined for degenerate
    # (zero-width or zero-height) boxes; without them the f-string would
    # raise UnboundLocalError instead of returning a score.
    left_aspect = 0.0
    right_aspect = 0.0
    if left_width == 0 or right_width == 0 or left_height == 0 or right_height == 0:
        aspect_similarity = 0.0
    else:
        left_aspect = left_width / left_height
        right_aspect = right_width / right_height
        aspect_diff = abs(left_aspect - right_aspect) / max(left_aspect, right_aspect)
        aspect_similarity = max(0, 1.0 - aspect_diff)

    # Combine metrics with weights
    similarity = (
        area_ratio * 0.3 +        # 30% weight on size similarity
        y_similarity * 0.4 +      # 40% weight on vertical alignment
        height_ratio * 0.2 +      # 20% weight on height similarity
        aspect_similarity * 0.1   # 10% weight on aspect ratio
    )

    # Detailed logging for each comparison
    logger.info(f" L{left_idx} vs R{right_idx}: area_ratio={area_ratio:.3f} (L={left_area}px, R={right_area}px), "
                f"y_sim={y_similarity:.3f} (L_y={left_center_y:.1f}, R_y={right_center_y:.1f}, diff={y_diff:.3f}), "
                f"height_ratio={height_ratio:.3f} (L_h={left_height:.1f}, R_h={right_height:.1f}), "
                f"aspect_sim={aspect_similarity:.3f} (L_asp={left_aspect:.2f}, R_asp={right_aspect:.2f}), "
                f"FINAL_SIMILARITY={similarity:.3f}")
    return similarity
def _find_matching_mask_pairs(self, left_masks: List[Dict[str, Any]], right_masks: List[Dict[str, Any]],
                              similarity_threshold: float) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Pair left-eye and right-eye detections by stereo similarity.

    Every left/right combination is scored with
    ``_calculate_stereo_similarity``. Combinations scoring strictly above
    ``similarity_threshold`` are taken greedily from best to worst, and each
    detection is used in at most one pair.

    Args:
        left_masks: Left-eye detections; each is read for its 'bbox',
            'mask' and 'confidence' entries.
        right_masks: Right-eye detections with the same entries.
        similarity_threshold: Minimum (exclusive) score for a candidate pair.

    Returns:
        ``(matched_pairs, unmatched_left, unmatched_right)``. Each matched
        pair is a dict with 'left_mask', 'right_mask' and 'similarity'.
        If either input list is empty, no pairs are returned and both
        input lists are handed back as the unmatched lists.
    """
    logger.info(f"Starting stereo mask matching with {len(left_masks)} left masks and {len(right_masks)} right masks.")
    if not (left_masks and right_masks):
        return [], left_masks, right_masks

    logger.info("--- Calculating all possible stereo similarity pairs ---")

    def log_eye_summary(heading, tag, detections):
        # One line per detection so the similarity logs can be cross-referenced.
        logger.info(f"{heading} ({len(detections)} total):")
        for idx, det in enumerate(detections):
            bbox = det['bbox']
            mask_area = np.sum(det['mask'])
            conf = det['confidence']
            logger.info(f" {tag}{idx}: bbox=[{bbox[0]:.1f},{bbox[1]:.1f},{bbox[2]:.1f},{bbox[3]:.1f}], area={mask_area}px, conf={conf:.3f}")

    log_eye_summary("LEFT EYE MASKS", "L", left_masks)
    log_eye_summary("RIGHT EYE MASKS", "R", right_masks)

    # Step 1: score every left/right combination, keeping those above threshold.
    logger.info("--- Stereo Similarity Calculations ---")
    candidates = []  # (left index, right index, score)
    for li, left_det in enumerate(left_masks):
        for ri, right_det in enumerate(right_masks):
            try:
                # Stereo similarity is used rather than IoU for VR180.
                score = self._calculate_stereo_similarity(
                    left_det['mask'], right_det['mask'],
                    left_det['bbox'], right_det['bbox'],
                    left_idx=li, right_idx=ri
                )
                if score > similarity_threshold:
                    candidates.append((li, ri, score))
                    logger.info(f" ✓ L{li} vs R{ri}: ABOVE THRESHOLD ({score:.4f} > {similarity_threshold:.4f})")
                else:
                    logger.info(f" ✗ L{li} vs R{ri}: BELOW THRESHOLD ({score:.4f} <= {similarity_threshold:.4f})")
            except Exception as e:
                # A failed comparison is logged and simply yields no candidate.
                logger.error(f" ERROR L{li} vs R{ri}: Exception in similarity calculation: {e}")

    # Step 2: best score first (stable, so ties keep their discovery order).
    candidates.sort(key=lambda cand: cand[2], reverse=True)
    logger.debug("--- Sorted similarity pairs above threshold ---")
    for li, ri, score in candidates:
        logger.debug(f" Pair (L{li}, R{ri}) - Similarity: {score:.4f}")

    # Step 3: greedy one-to-one assignment.
    matched_pairs = []
    used_left = set()
    used_right = set()
    logger.debug("--- Selecting best pairs ---")
    for li, ri, score in candidates:
        if li in used_left or ri in used_right:
            logger.debug(f" Skipping pair (L{li}, R{ri}) because one mask is already matched.")
            continue
        logger.info(f" MATCH FOUND: (L{li}, R{ri}) with Similarity {score:.4f}")
        matched_pairs.append({
            'left_mask': left_masks[li],
            'right_mask': right_masks[ri],
            'similarity': score,
        })
        used_left.add(li)
        used_right.add(ri)

    # Step 4: whatever was never assigned is an orphan.
    unmatched_left = [det for idx, det in enumerate(left_masks) if idx not in used_left]
    unmatched_right = [det for idx, det in enumerate(right_masks) if idx not in used_right]
    logger.info(f"Matching complete: Found {len(matched_pairs)} pairs. Left orphans: {len(unmatched_left)}, Right orphans: {len(unmatched_right)}.")
    return matched_pairs, unmatched_left, unmatched_right
def _save_stereo_agreement_debug_frame(self, left_frame: np.ndarray, right_frame: np.ndarray,
                                       left_detections: List[Dict[str, Any]], right_detections: List[Dict[str, Any]],
                                       matched_pairs: List[Dict[str, Any]], unmatched_left: List[Dict[str, Any]],
                                       unmatched_right: List[Dict[str, Any]], output_path: str, title: str):
    """
    Save a debug frame visualizing the stereo mask agreement process.

    The two eye frames are stacked side by side. Every detection mask of at
    least 100 px is outlined and labelled in red, then each matched pair is
    overdrawn in green with a connecting line and its similarity score.
    Any exception is logged and swallowed, so a debug-image failure does
    not propagate to the caller.

    Args:
        left_frame: Left-eye image (H x W x C).
        right_frame: Right-eye image; must stack horizontally with
            ``left_frame``.
        left_detections: Left-eye detections with 'mask', 'bbox', 'confidence'.
        right_detections: Right-eye detections with the same entries.
        matched_pairs: Dicts with 'left_mask', 'right_mask' and 'similarity'
            (or legacy 'iou').
        unmatched_left: Accepted for interface compatibility; not used here,
            because unmatched masks are already drawn in the first pass.
        unmatched_right: Same as ``unmatched_left``.
        output_path: Destination image path passed to ``cv2.imwrite``.
        title: Caption drawn in the top-left corner.
    """
    red = (0, 0, 255)
    green = (0, 255, 0)
    try:
        # Right-eye drawings are shifted by the width of the left frame.
        eye_width = left_frame.shape[1]
        combined_frame = np.hstack((left_frame, right_frame))

        def get_centroid(mask):
            m = cv2.moments(mask.astype(np.uint8), binaryImage=True)
            if m["m00"] == 0:
                return (0, 0)
            return (int(m["m10"] / m["m00"]), int(m["m01"] / m["m00"]))

        def draw_label(frame, text, pos, color):
            # Draw a black background rectangle, then the text on top of it
            cv2.rectangle(frame, (pos[0], pos[1] - 14), (pos[0] + len(text) * 8, pos[1] + 5), (0, 0, 0), -1)
            cv2.putText(frame, text, pos, cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

        def draw_mask(mask, x_offset, label, color, thickness):
            """Outline and label one mask.

            Returns (centroid, labelled), where centroid is in combined-frame
            coordinates, or (None, False) when the mask has no contour.
            """
            contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            if not contours:
                return None, False
            if x_offset:
                for cnt in contours:
                    cnt[:, :, 0] += x_offset
            cv2.drawContours(combined_frame, contours, -1, color, thickness)
            cx, cy = get_centroid(mask)
            centroid = (cx + x_offset, cy)
            labelled = centroid[0] > x_offset and centroid[1] > 0  # Valid centroid
            if labelled:
                draw_label(combined_frame, label, centroid, color)
            return centroid, labelled

        def find_index(detections, target):
            # Recover the detection's list index by comparing bbox and confidence.
            for idx, det in enumerate(detections):
                if (np.array_equal(det['bbox'], target['bbox']) and
                        abs(det['confidence'] - target['confidence']) < 0.001):
                    return idx
            return None

        # --- Draw ALL Masks First (to ensure every mask gets a label) ---
        logger.info(f"Debug Frame: Drawing {len(left_detections)} left masks and {len(right_detections)} right masks")
        for side, tag, detections, x_offset in (("left", "L", left_detections, 0),
                                                ("right", "R", right_detections, eye_width)):
            for i, detection in enumerate(detections):
                mask = detection['mask']
                mask_area = np.sum(mask > 0.5)
                # Skip tiny masks that are likely noise
                if mask_area < 100:  # Less than 100 pixels
                    logger.debug(f"Skipping tiny {side} mask {tag}{i} with area {mask_area}px")
                    continue
                centroid, labelled = draw_mask(mask, x_offset, f"{tag}{i}", red, 2)  # Default red for unmatched
                if labelled:
                    logger.debug(f"Drew {side} mask {tag}{i} at centroid {centroid}, area={mask_area}px")

        # --- Now Overdraw Matched Pairs in Green ---
        for pair in matched_pairs:
            left_det = pair['left_mask']
            right_det = pair['right_mask']
            left_idx = find_index(left_detections, left_det)
            right_idx = find_index(right_detections, right_det)
            left_label = f"L{left_idx if left_idx is not None else '?'}"
            right_label = f"R{right_idx if right_idx is not None else '?'}"

            c1, _ = draw_mask(left_det['mask'], 0, left_label, green, 3)  # Thicker green line
            c2, _ = draw_mask(right_det['mask'], eye_width, right_label, green, 3)

            # Only connect the pair when both centroids were computed for THIS
            # pair; otherwise the names would be unbound or stale from an
            # earlier iteration.
            if c1 is None or c2 is None:
                logger.debug(f"Skipping connector for pair ({left_label}, {right_label}): mask without contours")
                continue

            # Draw line connecting centroids and similarity score
            cv2.line(combined_frame, c1, c2, green, 2)
            similarity_text = f"Sim: {pair.get('similarity', pair.get('iou', 0)):.2f}"
            cv2.putText(combined_frame, similarity_text, (c1[0] + 10, c1[1] + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

        # Add title
        cv2.putText(combined_frame, title, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)

        # cv2.imwrite reports failure through its return value, not an exception.
        if cv2.imwrite(output_path, combined_frame):
            logger.info(f"Saved stereo agreement debug frame to {output_path}")
        else:
            logger.error(f"Failed to write stereo agreement debug frame to {output_path}")
    except Exception as e:
        logger.error(f"Failed to create stereo agreement debug frame: {e}")
def detect_and_match_stereo_pairs(self, frame: np.ndarray, confidence_reduction_factor: float,
                                  stereo_similarity_threshold: float, segment_info: dict, save_debug_frames: bool) -> List[Dict[str, Any]]:
    """
    Detect humans in both halves of a side-by-side frame and pair them up.

    The frame is split down the middle into left and right eye views.
    Each view is run through ``detect_humans_in_frame`` with detection-model
    validation enabled, and the results are paired by
    ``_find_matching_mask_pairs``. If the first attempt yields no pairs,
    detection is retried once at
    ``self.confidence_threshold * confidence_reduction_factor``.

    Args:
        frame: Full side-by-side frame (H x W x C).
        confidence_reduction_factor: Multiplier applied to the default
            confidence threshold for the retry attempt.
        stereo_similarity_threshold: Configured threshold; scaled by 0.6 and
            floored at 0.2 before being used for matching.
        segment_info: Only ``segment_info['directory']`` is read, and only
            when ``save_debug_frames`` is true.
        save_debug_frames: Whether to write the agreement debug images.

    Returns:
        Two prompt dicts per matched pair, in full-frame coordinates, each
        with 'obj_id', 'bbox' and 'mask'. Pair ``i`` produces obj_id
        ``2*i + 1`` for the left eye and ``2*i + 2`` for the right eye. The
        list is empty when no pair is found.
    """
    frame_height, frame_width, _ = frame.shape
    half_width = frame_width // 2
    left_eye_frame = frame[:, :half_width]
    right_eye_frame = frame[:, half_width:half_width * 2]  # Ensure exact same width
    logger.info(f"VR180 Frame Split: Original={frame.shape}, Left={left_eye_frame.shape}, Right={right_eye_frame.shape}")

    # Convert IOU threshold to similarity threshold (IOU 0.5 ≈ similarity 0.3)
    similarity_threshold = max(0.2, stereo_similarity_threshold * 0.6)

    def only_with_masks(detections, eye_side):
        # Matching and debug drawing index detection['mask'] directly, so
        # bbox-only detections must be dropped before they get there.
        usable = [d for d in detections if d.get('has_mask') and d.get('mask') is not None]
        dropped = len(detections) - len(usable)
        if dropped:
            logger.warning(f"Ignoring {dropped} {eye_side} eye detections without masks for stereo matching.")
        return usable

    def run_attempt(confidence_override, attempt_label, debug_filename):
        # One detect -> match -> (optional) debug-image pass over both eyes.
        used_confidence = self.confidence_threshold if confidence_override is None else confidence_override
        left_detections = only_with_masks(
            self.detect_humans_in_frame(left_eye_frame, confidence_override=confidence_override,
                                        validate_with_detection=True), 'left')
        right_detections = only_with_masks(
            self.detect_humans_in_frame(right_eye_frame, confidence_override=confidence_override,
                                        validate_with_detection=True), 'right')
        pairs, unmatched_left, unmatched_right = self._find_matching_mask_pairs(
            left_detections, right_detections, similarity_threshold)
        if save_debug_frames:
            debug_path = os.path.join(segment_info['directory'], debug_filename)
            title = f"{attempt_label} (Conf: {used_confidence:.2f}) - {len(pairs)} Pairs"
            self._save_stereo_agreement_debug_frame(left_eye_frame, right_eye_frame, left_detections, right_detections,
                                                    pairs, unmatched_left, unmatched_right, debug_path, title)
        return pairs

    # Initial detection with validation
    logger.info(f"Running initial stereo detection at {self.confidence_threshold} confidence.")
    matched_pairs = run_attempt(None, "Initial Attempt", "yolo_stereo_agreement_initial.jpg")

    # Retry with lower confidence if no pairs found
    if not matched_pairs:
        new_confidence = self.confidence_threshold * confidence_reduction_factor
        logger.info(f"No valid pairs found. Reducing confidence to {new_confidence:.2f} and retrying.")
        matched_pairs = run_attempt(new_confidence, "Retry Attempt", "yolo_stereo_agreement_retry.jpg")

    # Prepare final results - convert to full-frame coordinates and masks
    final_prompts = []
    if not matched_pairs:
        logger.warning("No valid stereo pairs found after all attempts.")
        return final_prompts

    logger.info(f"Found {len(matched_pairs)} valid stereo pairs.")
    for i, pair in enumerate(matched_pairs):
        # Convert eye-specific coordinates and masks to full-frame
        left_bbox_full_frame, left_mask_full_frame = self._convert_eye_to_full_frame(
            pair['left_mask']['bbox'], pair['left_mask']['mask'],
            'left', frame_width, frame_height
        )
        right_bbox_full_frame, right_mask_full_frame = self._convert_eye_to_full_frame(
            pair['right_mask']['bbox'], pair['right_mask']['mask'],
            'right', frame_width, frame_height
        )
        logger.info(f"Stereo Pair {i}: Left bbox {pair['left_mask']['bbox']} -> {left_bbox_full_frame}")
        logger.info(f"Stereo Pair {i}: Right bbox {pair['right_mask']['bbox']} -> {right_bbox_full_frame}")

        # Create prompts for SAM2 with full-frame coordinates and masks
        final_prompts.append({
            'obj_id': i * 2 + 1,
            'bbox': left_bbox_full_frame,
            'mask': left_mask_full_frame
        })
        final_prompts.append({
            'obj_id': i * 2 + 2,
            'bbox': right_bbox_full_frame,
            'mask': right_mask_full_frame
        })
    return final_prompts
def _convert_eye_to_full_frame(self, eye_bbox: np.ndarray, eye_mask: np.ndarray,
                               eye_side: str, full_frame_width: int, full_frame_height: int) -> tuple:
    """
    Map a per-eye bounding box and mask into full-frame coordinates.

    Args:
        eye_bbox: Box in eye coordinates, indexed as [x1, y1, x2, y2].
        eye_mask: 2D mask in eye coordinates.
        eye_side: 'left' or 'right'.
        full_frame_width: Width of the full side-by-side frame.
        full_frame_height: Height of the full side-by-side frame.

    Returns:
        Tuple of (full_frame_bbox, full_frame_mask). The bbox is a copy of
        ``eye_bbox`` with both x values shifted by half the frame width for
        the right eye. The mask is a zero array of the full frame size and
        of ``eye_mask``'s dtype, with the eye mask pasted into its half and
        cropped if it is larger than that half.
    """
    half_width = full_frame_width // 2

    # Only x1/x2 move, and only for the right eye.
    full_frame_bbox = eye_bbox.copy()
    if eye_side == 'right':
        for x_index in (0, 2):
            full_frame_bbox[x_index] += half_width

    full_frame_mask = np.zeros((full_frame_height, full_frame_width), dtype=eye_mask.dtype)

    # Crop the eye mask to what fits inside one half of the full frame.
    eye_height, eye_width = eye_mask.shape
    rows = min(eye_height, full_frame_height)
    cols = min(eye_width, half_width)

    # Anything other than 'left' is pasted into the right half.
    col_start = 0 if eye_side == 'left' else half_width
    full_frame_mask[:rows, col_start:col_start + cols] = eye_mask[:rows, :cols]

    logger.debug(f"Converted {eye_side} eye: bbox {eye_bbox} -> {full_frame_bbox}, "
                 f"mask {eye_mask.shape} -> {full_frame_mask.shape}, "
                 f"mask_pixels: {np.sum(eye_mask > 0.5)} -> {np.sum(full_frame_mask > 0.5)}")
    return full_frame_bbox, full_frame_mask
def _validate_masks_with_detection(self, frame: np.ndarray, segmentation_detections: List[Dict[str, Any]],
                                   confidence_override: Optional[float] = None) -> List[Dict[str, Any]]:
    """
    Filter segmentation detections using a separate detection model.

    A detection model is loaded on first use and kept in
    ``self._detection_model``. It is run on ``frame``, and a segmentation
    detection is kept only if its bbox overlaps some human detection bbox
    by more than 0.3, as measured by ``_calculate_bbox_overlap``. Only the
    bounding boxes are compared; the mask pixels are used solely for the
    rejection log message.

    Args:
        frame: Image passed to the detection model.
        segmentation_detections: Detections to validate; entries whose
            'has_mask' is false are kept without checking.
        confidence_override: Confidence for the detection model; defaults
            to ``self.confidence_threshold`` when None.

    Returns:
        The detections that passed validation. If the detection model
        cannot be loaded, the input list is returned unchanged.
    """
    if not hasattr(self, '_detection_model'):
        # Lazily load the detection model the first time validation runs.
        try:
            detection_model_path = self.model_path.replace('-seg.pt', '.pt')  # Try to find detection version
            if not os.path.exists(detection_model_path):
                detection_model_path = "yolo11l.pt"  # Fallback to default
            logger.info(f"Loading detection model for validation: {detection_model_path}")
            self._detection_model = YOLO(detection_model_path)
        except Exception as e:
            logger.warning(f"Could not load detection model for validation: {e}")
            return segmentation_detections

    if confidence_override is not None:
        confidence = confidence_override
    else:
        confidence = self.confidence_threshold
    detection_results = self._detection_model(frame, conf=confidence, verbose=False)

    # Gather the bounding boxes of every human the detection model found.
    human_boxes = []
    for result in detection_results:
        if result.boxes is None:
            continue
        for box in result.boxes:
            if int(box.cls.cpu().numpy()[0]) != self.human_class_id:
                continue
            human_boxes.append({
                'bbox': box.xyxy[0].cpu().numpy(),
                'confidence': float(box.conf.cpu().numpy()[0]),
            })
    logger.info(f"Validation: Found {len(human_boxes)} detection bboxes vs {len(segmentation_detections)} segmentation masks")

    kept = []
    for seg_det in segmentation_detections:
        if not seg_det['has_mask']:
            # Nothing to validate for bbox-only detections.
            kept.append(seg_det)
            continue

        mask = seg_det['mask']
        seg_bbox = seg_det['bbox']

        # Find the detection box that overlaps this segmentation box the most.
        top_overlap = 0.0
        top_box = None
        for candidate in human_boxes:
            overlap = self._calculate_bbox_overlap(seg_bbox, candidate['bbox'])
            if overlap > top_overlap:
                top_overlap = overlap
                top_box = candidate

        if top_overlap > 0.3:  # 30% overlap threshold
            logger.info(f"Validation: Segmentation mask validated (overlap={top_overlap:.3f} with detection conf={top_box['confidence']:.3f})")
            kept.append(seg_det)
            continue

        mask_area = np.sum(mask > 0.5)
        logger.warning(f"Validation: Rejecting segmentation mask with low overlap ({top_overlap:.3f}) - area={mask_area}px")

    logger.info(f"Validation: Kept {len(kept)}/{len(segmentation_detections)} segmentation masks")
    return kept
def _calculate_bbox_overlap(self, bbox1: np.ndarray, bbox2: np.ndarray) -> float:
"""Calculate the overlap ratio between two bounding boxes."""
# Calculate intersection
x1 = max(bbox1[0], bbox2[0])
y1 = max(bbox1[1], bbox2[1])
x2 = min(bbox1[2], bbox2[2])
y2 = min(bbox1[3], bbox2[3])
if x2 <= x1 or y2 <= y1:
return 0.0
intersection = (x2 - x1) * (y2 - y1)
# Calculate areas
area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
# Return intersection over smaller area (more lenient than IoU)
return intersection / min(area1, area2) if min(area1, area2) > 0 else 0.0