working with segmentation
@@ -50,11 +50,31 @@ class ConfigLoader:
|
||||
raise ValueError(f"Missing required field: output.{field}")
|
||||
|
||||
# Validate models section
|
||||
required_model_fields = ['yolo_model', 'sam2_checkpoint', 'sam2_config']
|
||||
required_model_fields = ['sam2_checkpoint', 'sam2_config']
|
||||
for field in required_model_fields:
|
||||
if field not in self.config['models']:
|
||||
raise ValueError(f"Missing required field: models.{field}")
|
||||
|
||||
# Validate YOLO model configuration
|
||||
yolo_mode = self.config['models'].get('yolo_mode', 'detection')
|
||||
if yolo_mode not in ['detection', 'segmentation']:
|
||||
raise ValueError(f"Invalid yolo_mode: {yolo_mode}. Must be 'detection' or 'segmentation'")
|
||||
|
||||
# Check for legacy yolo_model field vs new structure
|
||||
has_legacy_yolo_model = 'yolo_model' in self.config['models']
|
||||
has_new_yolo_models = 'yolo_detection_model' in self.config['models'] or 'yolo_segmentation_model' in self.config['models']
|
||||
|
||||
if not has_legacy_yolo_model and not has_new_yolo_models:
|
||||
raise ValueError("Missing YOLO model configuration. Provide either 'yolo_model' (legacy) or 'yolo_detection_model'/'yolo_segmentation_model' (new)")
|
||||
|
||||
# Validate that the required model for the current mode exists
|
||||
if yolo_mode == 'detection':
|
||||
if has_new_yolo_models and 'yolo_detection_model' not in self.config['models']:
|
||||
raise ValueError("yolo_mode is 'detection' but yolo_detection_model not specified")
|
||||
elif yolo_mode == 'segmentation':
|
||||
if has_new_yolo_models and 'yolo_segmentation_model' not in self.config['models']:
|
||||
raise ValueError("yolo_mode is 'segmentation' but yolo_segmentation_model not specified")
|
||||
|
||||
# Validate processing.detect_segments format
|
||||
detect_segments = self.config['processing'].get('detect_segments', 'all')
|
||||
if not isinstance(detect_segments, (str, list)):
|
||||
@@ -114,8 +134,17 @@ class ConfigLoader:
|
||||
return self.config['processing'].get('detect_segments', 'all')
|
||||
|
||||
def get_yolo_model_path(self) -> str:
|
||||
"""Get YOLO model path."""
|
||||
return self.config['models']['yolo_model']
|
||||
"""Get YOLO model path (legacy method for backward compatibility)."""
|
||||
# Check for legacy configuration first
|
||||
if 'yolo_model' in self.config['models']:
|
||||
return self.config['models']['yolo_model']
|
||||
|
||||
# Use new configuration based on mode
|
||||
yolo_mode = self.config['models'].get('yolo_mode', 'detection')
|
||||
if yolo_mode == 'detection':
|
||||
return self.config['models'].get('yolo_detection_model', 'yolov8n.pt')
|
||||
else: # segmentation mode
|
||||
return self.config['models'].get('yolo_segmentation_model', 'yolov8n-seg.pt')
|
||||
|
||||
def get_sam2_checkpoint(self) -> str:
|
||||
"""Get SAM2 checkpoint path."""
|
||||
|
||||
@@ -47,8 +47,23 @@ class SAM2Processor:
|
||||
logger.info(f"Using device: {device}")
|
||||
|
||||
try:
|
||||
# Extract just the config filename for SAM2's Hydra-based loader
|
||||
# SAM2 expects a config name relative to its internal config directory
|
||||
config_name = os.path.basename(self.config_path)
|
||||
if config_name.endswith('.yaml'):
|
||||
config_name = config_name[:-5] # Remove .yaml extension
|
||||
|
||||
# SAM2 configs are in the format "sam2.1_hiera_X.yaml"
|
||||
# and should be referenced as "configs/sam2.1/sam2.1_hiera_X"
|
||||
if config_name.startswith("sam2.1_hiera"):
|
||||
config_name = f"configs/sam2.1/{config_name}"
|
||||
elif config_name.startswith("sam2_hiera"):
|
||||
config_name = f"configs/sam2/{config_name}"
|
||||
|
||||
logger.info(f"Using SAM2 config: {config_name}")
|
||||
|
||||
self.predictor = build_sam2_video_predictor(
|
||||
self.config_path,
|
||||
config_name, # Use just the config name, not full path
|
||||
self.checkpoint_path,
|
||||
device=device
|
||||
)
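The filename-to-Hydra-name mapping above can be checked in isolation; a small self-contained sketch of the same rule (not part of the commit, path is a placeholder):
import os

def to_hydra_config_name(config_path: str) -> str:
    # Strip directory and .yaml suffix, then add the prefix SAM2's
    # Hydra-based loader expects for the 2.1 and 2.0 config families.
    name = os.path.basename(config_path)
    if name.endswith('.yaml'):
        name = name[:-5]
    if name.startswith('sam2.1_hiera'):
        return f"configs/sam2.1/{name}"
    if name.startswith('sam2_hiera'):
        return f"configs/sam2/{name}"
    return name

# to_hydra_config_name('/models/sam2.1_hiera_l.yaml') -> 'configs/sam2.1/sam2.1_hiera_l'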
|
||||
@@ -103,6 +118,7 @@ class SAM2Processor:
|
||||
def add_yolo_prompts_to_predictor(self, inference_state, prompts: List[Dict[str, Any]]) -> bool:
|
||||
"""
|
||||
Add YOLO detection prompts to SAM2 predictor.
|
||||
Includes error handling matching the working spec.md implementation.
|
||||
|
||||
Args:
|
||||
inference_state: SAM2 inference state
|
||||
@@ -112,14 +128,21 @@ class SAM2Processor:
|
||||
True if prompts were added successfully
|
||||
"""
|
||||
if not prompts:
|
||||
logger.warning("No prompts provided to SAM2")
|
||||
logger.warning("SAM2 Debug: No prompts provided to SAM2")
|
||||
return False
|
||||
|
||||
try:
|
||||
for prompt in prompts:
|
||||
obj_id = prompt['obj_id']
|
||||
bbox = prompt['bbox']
|
||||
|
||||
logger.info(f"SAM2 Debug: Received {len(prompts)} prompts to add to predictor")
|
||||
|
||||
success_count = 0
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
obj_id = prompt['obj_id']
|
||||
bbox = prompt['bbox']
|
||||
confidence = prompt.get('confidence', 'unknown')
|
||||
|
||||
logger.info(f"SAM2 Debug: Adding prompt {i+1}/{len(prompts)}: Object {obj_id}, bbox={bbox}, conf={confidence}")
|
||||
|
||||
try:
|
||||
_, out_obj_ids, out_mask_logits = self.predictor.add_new_points_or_box(
|
||||
inference_state=inference_state,
|
||||
frame_idx=0,
|
||||
@@ -127,13 +150,19 @@ class SAM2Processor:
|
||||
box=bbox.astype(np.float32),
|
||||
)
|
||||
|
||||
logger.debug(f"Added prompt for Object {obj_id}: {bbox}")
|
||||
|
||||
logger.info(f"Successfully added {len(prompts)} prompts to SAM2")
|
||||
logger.info(f"SAM2 Debug: ✓ Successfully added Object {obj_id} - returned obj_ids: {out_obj_ids}")
|
||||
success_count += 1
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"SAM2 Debug: ✗ Error adding Object {obj_id}: {e}")
|
||||
# Continue processing other prompts even if one fails
|
||||
continue
|
||||
|
||||
if success_count > 0:
|
||||
logger.info(f"SAM2 Debug: Final result - {success_count}/{len(prompts)} prompts successfully added")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error adding prompts to SAM2: {e}")
|
||||
else:
|
||||
logger.error("SAM2 Debug: FAILED - No prompts were successfully added to SAM2")
|
||||
return False
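The prompts consumed here are plain dictionaries; a sketch of the expected shape with invented values (obj_id, bbox as an x1,y1,x2,y2 array, optional confidence), assuming a SAM2Processor instance named processor:
import numpy as np

example_prompts = [
    {'obj_id': 1, 'bbox': np.array([120.0, 340.0, 480.0, 900.0]), 'confidence': 0.87},
    {'obj_id': 2, 'bbox': np.array([2120.0, 350.0, 2470.0, 910.0]), 'confidence': 0.81},
]
# processor.add_yolo_prompts_to_predictor(inference_state, example_prompts)
# returns True if at least one box was accepted on frame 0.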
|
||||
|
||||
def load_previous_segment_mask(self, prev_segment_dir: str) -> Optional[Dict[int, np.ndarray]]:
|
||||
@@ -235,15 +264,17 @@ class SAM2Processor:
|
||||
|
||||
def process_single_segment(self, segment_info: dict, yolo_prompts: Optional[List[Dict[str, Any]]] = None,
|
||||
previous_masks: Optional[Dict[int, np.ndarray]] = None,
|
||||
inference_scale: float = 0.5) -> Optional[Dict[int, Dict[int, np.ndarray]]]:
|
||||
inference_scale: float = 0.5,
|
||||
multi_frame_prompts: Optional[Dict[int, List[Dict[str, Any]]]] = None) -> Optional[Dict[int, Dict[int, np.ndarray]]]:
|
||||
"""
|
||||
Process a single video segment with SAM2.
|
||||
|
||||
Args:
|
||||
segment_info: Segment information dictionary
|
||||
yolo_prompts: Optional YOLO detection prompts
|
||||
yolo_prompts: Optional YOLO detection prompts for first frame
|
||||
previous_masks: Optional masks from previous segment
|
||||
inference_scale: Scale factor for inference
|
||||
multi_frame_prompts: Optional prompts for multiple frames (mid-segment detection)
|
||||
|
||||
Returns:
|
||||
Video segments dictionary or None if failed
|
||||
@@ -284,6 +315,13 @@ class SAM2Processor:
|
||||
logger.error(f"No prompts or previous masks available for segment {segment_idx}")
|
||||
return None
|
||||
|
||||
# Add mid-segment prompts if provided
|
||||
if multi_frame_prompts:
|
||||
logger.info(f"Adding mid-segment prompts for segment {segment_idx}")
|
||||
if not self.add_multi_frame_prompts_to_predictor(inference_state, multi_frame_prompts):
|
||||
logger.warning(f"Failed to add mid-segment prompts for segment {segment_idx}")
|
||||
# Don't return None here - continue with existing prompts
|
||||
|
||||
# Propagate masks
|
||||
video_segments = self.propagate_masks(inference_state)
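A hedged usage sketch of the extended signature, assuming a SAM2Processor instance named processor and prompt lists already prepared; the segment_info keys and paths are placeholders, since the real dictionary comes from elsewhere in the pipeline:
video_segments = processor.process_single_segment(
    segment_info={'segment_idx': 3, 'video_path': 'segments/segment_003.mp4'},  # placeholder keys
    yolo_prompts=first_frame_prompts,                  # prompts for frame 0
    inference_scale=0.5,
    multi_frame_prompts={30: prompts_at_30, 60: prompts_at_60},  # frame_idx -> prompt list
)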
|
||||
|
||||
@@ -359,4 +397,198 @@ class SAM2Processor:
|
||||
logger.info(f"Saved final masks to {output_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving final masks: {e}")
|
||||
logger.error(f"Error saving final masks: {e}")
|
||||
|
||||
def generate_first_frame_debug_masks(self, video_path: str, prompts: List[Dict[str, Any]],
|
||||
output_path: str, inference_scale: float = 0.5) -> bool:
|
||||
"""
|
||||
Generate SAM2 masks for just the first frame and save debug visualization.
|
||||
This helps debug what SAM2 is producing for each detected object.
|
||||
|
||||
Args:
|
||||
video_path: Path to the video file
|
||||
prompts: List of SAM2 prompt dictionaries
|
||||
output_path: Path to save the debug image
|
||||
inference_scale: Scale factor for SAM2 inference
|
||||
|
||||
Returns:
|
||||
True if debug masks were generated successfully
|
||||
"""
|
||||
if not prompts:
|
||||
logger.warning("No prompts provided for first frame debug")
|
||||
return False
|
||||
|
||||
try:
|
||||
logger.info(f"SAM2 Debug: Generating first frame masks for {len(prompts)} objects")
|
||||
|
||||
# Load the first frame
|
||||
cap = cv2.VideoCapture(video_path)
|
||||
ret, original_frame = cap.read()
|
||||
cap.release()
|
||||
|
||||
if not ret:
|
||||
logger.error("Could not read first frame for debug mask generation")
|
||||
return False
|
||||
|
||||
# Scale frame for inference if needed
|
||||
if inference_scale != 1.0:
|
||||
inference_frame = cv2.resize(original_frame, None, fx=inference_scale, fy=inference_scale, interpolation=cv2.INTER_LINEAR)
|
||||
else:
|
||||
inference_frame = original_frame.copy()
|
||||
|
||||
# Create temporary low-res video with just first frame
|
||||
import tempfile
|
||||
import os
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
temp_video_path = os.path.join(temp_dir, "first_frame.mp4")
|
||||
|
||||
# Write single frame to temporary video
|
||||
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
|
||||
out = cv2.VideoWriter(temp_video_path, fourcc, 1.0, (inference_frame.shape[1], inference_frame.shape[0]))
|
||||
out.write(inference_frame)
|
||||
out.release()
|
||||
|
||||
# Initialize SAM2 inference state with single frame
|
||||
inference_state = self.predictor.init_state(video_path=temp_video_path, async_loading_frames=True)
|
||||
|
||||
# Add prompts
|
||||
if not self.add_yolo_prompts_to_predictor(inference_state, prompts):
|
||||
logger.error("Failed to add prompts for first frame debug")
|
||||
return False
|
||||
|
||||
# Generate masks for first frame only
|
||||
frame_masks = {}
|
||||
for out_frame_idx, out_obj_ids, out_mask_logits in self.predictor.propagate_in_video(inference_state):
|
||||
if out_frame_idx == 0: # Only process first frame
|
||||
frame_masks = {
|
||||
out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
|
||||
for i, out_obj_id in enumerate(out_obj_ids)
|
||||
}
|
||||
break
|
||||
|
||||
if not frame_masks:
|
||||
logger.error("No masks generated for first frame debug")
|
||||
return False
|
||||
|
||||
# Create debug visualization
|
||||
debug_frame = original_frame.copy()
|
||||
|
||||
# Define colors for each object
|
||||
colors = {
|
||||
1: (0, 255, 0), # Green for Object 1 (Left eye)
|
||||
2: (255, 0, 0), # Blue for Object 2 (Right eye)
|
||||
3: (0, 255, 255), # Yellow for Object 3
|
||||
4: (255, 0, 255), # Magenta for Object 4
|
||||
}
|
||||
|
||||
# Overlay masks with transparency
|
||||
for obj_id, mask in frame_masks.items():
|
||||
mask = mask.squeeze()
|
||||
|
||||
# Resize mask to match original frame if needed
|
||||
if mask.shape != original_frame.shape[:2]:
|
||||
mask = cv2.resize(mask.astype(np.float32), (original_frame.shape[1], original_frame.shape[0]), interpolation=cv2.INTER_NEAREST)
|
||||
mask = mask > 0.5
|
||||
|
||||
# Apply colored overlay
|
||||
color = colors.get(obj_id, (128, 128, 128))
|
||||
overlay = debug_frame.copy()
|
||||
overlay[mask] = color
|
||||
|
||||
# Blend with original (30% overlay, 70% original)
|
||||
cv2.addWeighted(overlay, 0.3, debug_frame, 0.7, 0, debug_frame)
|
||||
|
||||
# Draw outline
|
||||
contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
cv2.drawContours(debug_frame, contours, -1, color, 2)
|
||||
|
||||
logger.info(f"SAM2 Debug: Object {obj_id} mask - shape: {mask.shape}, pixels: {np.sum(mask)}")
|
||||
|
||||
# Add title
|
||||
title = f"SAM2 First Frame Masks: {len(frame_masks)} objects detected"
|
||||
cv2.putText(debug_frame, title, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)
|
||||
|
||||
# Add mask source information
|
||||
source_info = "Mask Source: SAM2 (from YOLO bounding boxes)"
|
||||
cv2.putText(debug_frame, source_info, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)
|
||||
|
||||
# Add object legend
|
||||
y_offset = 90
|
||||
for obj_id in sorted(frame_masks.keys()):
|
||||
color = colors.get(obj_id, (128, 128, 128))
|
||||
text = f"Object {obj_id}: {'Left Eye' if obj_id == 1 else 'Right Eye' if obj_id == 2 else f'Object {obj_id}'}"
|
||||
cv2.putText(debug_frame, text, (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
|
||||
y_offset += 30
|
||||
|
||||
# Save debug image
|
||||
success = cv2.imwrite(output_path, debug_frame)
|
||||
|
||||
# Cleanup
|
||||
self.predictor.reset_state(inference_state)
|
||||
import shutil
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
if success:
|
||||
logger.info(f"SAM2 Debug: Saved first frame masks to {output_path}")
|
||||
return True
|
||||
else:
|
||||
logger.error(f"Failed to save first frame masks to {output_path}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating first frame debug masks: {e}")
|
||||
return False
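Typical use is one debug image per segment before full propagation; a minimal call sketch (paths are placeholders, processor and first_frame_prompts assumed to exist):
ok = processor.generate_first_frame_debug_masks(
    video_path='segments/segment_000.mp4',
    prompts=first_frame_prompts,
    output_path='debug/segment_000_first_frame_masks.jpg',
    inference_scale=0.5,
)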
|
||||
|
||||
def add_multi_frame_prompts_to_predictor(self, inference_state, multi_frame_prompts: Dict[int, List[Dict[str, Any]]]) -> bool:
|
||||
"""
|
||||
Add YOLO detection prompts at multiple frame indices for mid-segment re-detection.
|
||||
|
||||
Args:
|
||||
inference_state: SAM2 inference state
|
||||
multi_frame_prompts: Dictionary mapping frame_index -> list of prompt dictionaries
|
||||
|
||||
Returns:
|
||||
True if prompts were added successfully
|
||||
"""
|
||||
if not multi_frame_prompts:
|
||||
logger.warning("SAM2 Mid-segment: No multi-frame prompts provided")
|
||||
return False
|
||||
|
||||
total_prompts = sum(len(prompts) for prompts in multi_frame_prompts.values())
|
||||
logger.info(f"SAM2 Mid-segment: Adding {total_prompts} prompts across {len(multi_frame_prompts)} frames")
|
||||
|
||||
success_count = 0
|
||||
total_count = 0
|
||||
|
||||
for frame_idx, prompts in multi_frame_prompts.items():
|
||||
logger.info(f"SAM2 Mid-segment: Processing frame {frame_idx} with {len(prompts)} prompts")
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
obj_id = prompt['obj_id']
|
||||
bbox = prompt['bbox']
|
||||
confidence = prompt.get('confidence', 'unknown')
|
||||
total_count += 1
|
||||
|
||||
logger.info(f"SAM2 Mid-segment: Frame {frame_idx}, Prompt {i+1}/{len(prompts)}: Object {obj_id}, bbox={bbox}, conf={confidence}")
|
||||
|
||||
try:
|
||||
_, out_obj_ids, out_mask_logits = self.predictor.add_new_points_or_box(
|
||||
inference_state=inference_state,
|
||||
frame_idx=frame_idx, # Key: specify the exact frame index
|
||||
obj_id=obj_id,
|
||||
box=bbox.astype(np.float32),
|
||||
)
|
||||
|
||||
logger.info(f"SAM2 Mid-segment: ✓ Frame {frame_idx}, Object {obj_id} added successfully - returned obj_ids: {out_obj_ids}")
|
||||
success_count += 1
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"SAM2 Mid-segment: ✗ Frame {frame_idx}, Object {obj_id} failed: {e}")
|
||||
continue
|
||||
|
||||
if success_count > 0:
|
||||
logger.info(f"SAM2 Mid-segment: Final result - {success_count}/{total_count} prompts successfully added across {len(multi_frame_prompts)} frames")
|
||||
return True
|
||||
else:
|
||||
logger.error("SAM2 Mid-segment: FAILED - No prompts were successfully added")
|
||||
return False
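The multi_frame_prompts argument maps a frame index within the segment to the same prompt dictionaries used for frame 0; a sketch with made-up frame indices and boxes:
import numpy as np

multi_frame_prompts = {
    30: [{'obj_id': 1, 'bbox': np.array([130.0, 335.0, 470.0, 895.0]), 'confidence': 0.79}],
    60: [{'obj_id': 1, 'bbox': np.array([140.0, 330.0, 465.0, 890.0]), 'confidence': 0.82},
         {'obj_id': 2, 'bbox': np.array([2110.0, 345.0, 2460.0, 905.0]), 'confidence': 0.76}],
}
# processor.add_multi_frame_prompts_to_predictor(inference_state, multi_frame_prompts)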
|
||||
@@ -7,31 +7,56 @@ import os
|
||||
import cv2
|
||||
import numpy as np
|
||||
import logging
|
||||
from typing import List, Dict, Any, Optional
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from ultralytics import YOLO
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class YOLODetector:
|
||||
"""Handles YOLO-based human detection for video segments."""
|
||||
"""Handles YOLO-based human detection for video segments with support for both detection and segmentation modes."""
|
||||
|
||||
def __init__(self, model_path: str, confidence_threshold: float = 0.6, human_class_id: int = 0):
|
||||
def __init__(self, detection_model_path: str = None, segmentation_model_path: str = None,
|
||||
mode: str = "detection", confidence_threshold: float = 0.6, human_class_id: int = 0):
|
||||
"""
|
||||
Initialize YOLO detector.
|
||||
Initialize YOLO detector with support for both detection and segmentation modes.
|
||||
|
||||
Args:
|
||||
model_path: Path to YOLO model weights
|
||||
detection_model_path: Path to YOLO detection model weights (e.g., yolov8n.pt)
|
||||
segmentation_model_path: Path to YOLO segmentation model weights (e.g., yolov8n-seg.pt)
|
||||
mode: Detection mode - "detection" for bboxes, "segmentation" for masks
|
||||
confidence_threshold: Detection confidence threshold
|
||||
human_class_id: COCO class ID for humans (0 = person)
|
||||
"""
|
||||
self.model_path = model_path
|
||||
self.mode = mode
|
||||
self.confidence_threshold = confidence_threshold
|
||||
self.human_class_id = human_class_id
|
||||
|
||||
# Select model path based on mode
|
||||
if mode == "segmentation":
|
||||
if not segmentation_model_path:
|
||||
raise ValueError("segmentation_model_path required for segmentation mode")
|
||||
self.model_path = segmentation_model_path
|
||||
self.supports_segmentation = True
|
||||
elif mode == "detection":
|
||||
if not detection_model_path:
|
||||
raise ValueError("detection_model_path required for detection mode")
|
||||
self.model_path = detection_model_path
|
||||
self.supports_segmentation = False
|
||||
else:
|
||||
raise ValueError(f"Invalid mode: {mode}. Must be 'detection' or 'segmentation'")
|
||||
|
||||
# Load YOLO model
|
||||
try:
|
||||
self.model = YOLO(model_path)
|
||||
logger.info(f"Loaded YOLO model from {model_path}")
|
||||
self.model = YOLO(self.model_path)
|
||||
logger.info(f"Loaded YOLO model in {mode} mode from {self.model_path}")
|
||||
|
||||
# Verify model capabilities
|
||||
if mode == "segmentation":
|
||||
# Test if model actually supports segmentation
|
||||
logger.info(f"YOLO Segmentation: Model loaded, will output direct masks")
|
||||
else:
|
||||
logger.info(f"YOLO Detection: Model loaded, will output bounding boxes")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load YOLO model: {e}")
|
||||
raise
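Construction under the new signature, one instance per mode; a sketch using the default Ultralytics weight names mentioned elsewhere in this diff:
detector = YOLODetector(detection_model_path='yolov8n.pt',
                        mode='detection', confidence_threshold=0.6)

segmenter = YOLODetector(segmentation_model_path='yolov8n-seg.pt',
                         mode='segmentation', confidence_threshold=0.6)
# segmenter.supports_segmentation is True, so detect_humans_in_frame()
# will attach per-detection masks when the model provides them.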
|
||||
@@ -44,9 +69,9 @@ class YOLODetector:
|
||||
frame: Input frame (BGR format from OpenCV)
|
||||
|
||||
Returns:
|
||||
List of human detection dictionaries with bbox and confidence
|
||||
List of human detection dictionaries with bbox, confidence, and optionally masks
|
||||
"""
|
||||
# Run YOLO detection
|
||||
# Run YOLO detection/segmentation
|
||||
results = self.model(frame, conf=self.confidence_threshold, verbose=False)
|
||||
|
||||
human_detections = []
|
||||
@@ -54,8 +79,10 @@ class YOLODetector:
|
||||
# Process results
|
||||
for result in results:
|
||||
boxes = result.boxes
|
||||
masks = result.masks if hasattr(result, 'masks') and result.masks is not None else None
|
||||
|
||||
if boxes is not None:
|
||||
for box in boxes:
|
||||
for i, box in enumerate(boxes):
|
||||
# Get class ID
|
||||
cls = int(box.cls.cpu().numpy()[0])
|
||||
|
||||
@@ -65,12 +92,29 @@ class YOLODetector:
|
||||
coords = box.xyxy[0].cpu().numpy()
|
||||
conf = float(box.conf.cpu().numpy()[0])
|
||||
|
||||
human_detections.append({
|
||||
detection = {
|
||||
'bbox': coords,
|
||||
'confidence': conf
|
||||
})
|
||||
'confidence': conf,
|
||||
'has_mask': False,
|
||||
'mask': None
|
||||
}
|
||||
|
||||
logger.debug(f"Detected human with confidence {conf:.2f} at {coords}")
|
||||
# Extract mask if available (segmentation mode)
|
||||
if masks is not None and i < len(masks.data):
|
||||
mask_data = masks.data[i].cpu().numpy() # Get mask for this detection
|
||||
detection['has_mask'] = True
|
||||
detection['mask'] = mask_data
|
||||
logger.debug(f"YOLO Segmentation: Detected human with mask - conf={conf:.2f}, mask_shape={mask_data.shape}")
|
||||
else:
|
||||
logger.debug(f"YOLO Detection: Detected human with bbox - conf={conf:.2f}, bbox={coords}")
|
||||
|
||||
human_detections.append(detection)
|
||||
|
||||
if self.supports_segmentation:
|
||||
masks_found = sum(1 for d in human_detections if d['has_mask'])
|
||||
logger.info(f"YOLO Segmentation: Found {len(human_detections)} humans, {masks_found} with masks")
|
||||
else:
|
||||
logger.debug(f"YOLO Detection: Found {len(human_detections)} humans with bounding boxes")
|
||||
|
||||
return human_detections
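Callers see one dictionary per detection in both modes; a short consumption sketch, assuming a detector instance and any BGR frame:
detections = detector.detect_humans_in_frame(frame)
boxes = [d['bbox'] for d in detections]                   # always present
masks = [d['mask'] for d in detections if d['has_mask']]  # segmentation mode only
# each mask comes straight from result.masks.data[i]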
|
||||
|
||||
@@ -153,25 +197,33 @@ class YOLODetector:
|
||||
|
||||
try:
|
||||
with open(file_path, 'r') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
# Skip comments and empty lines
|
||||
if line.startswith('#') or not line:
|
||||
content = f.read()
|
||||
|
||||
# Handle files with literal \n characters
|
||||
if '\\n' in content:
|
||||
lines = content.split('\\n')
|
||||
else:
|
||||
lines = content.split('\n')
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
# Skip comments and empty lines
|
||||
if line.startswith('#') or not line:
|
||||
continue
|
||||
|
||||
# Parse detection line: x1,y1,x2,y2,confidence
|
||||
parts = line.split(',')
|
||||
if len(parts) == 5:
|
||||
try:
|
||||
bbox = [float(x) for x in parts[:4]]
|
||||
conf = float(parts[4])
|
||||
detections.append({
|
||||
'bbox': np.array(bbox),
|
||||
'confidence': conf
|
||||
})
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid detection line: {line}")
|
||||
continue
|
||||
|
||||
# Parse detection line: x1,y1,x2,y2,confidence
|
||||
parts = line.split(',')
|
||||
if len(parts) == 5:
|
||||
try:
|
||||
bbox = [float(x) for x in parts[:4]]
|
||||
conf = float(parts[4])
|
||||
detections.append({
|
||||
'bbox': np.array(bbox),
|
||||
'confidence': conf
|
||||
})
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid detection line: {line}")
|
||||
continue
|
||||
|
||||
logger.info(f"Loaded {len(detections)} detections from {file_path}")
|
||||
except Exception as e:
|
||||
@@ -179,6 +231,120 @@ class YOLODetector:
|
||||
|
||||
return detections
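The file parsed above is plain text, one x1,y1,x2,y2,confidence record per line with '#' comment lines; a sketch of producing one (path and numbers invented):
with open('detections/segment_000.txt', 'w') as f:
    f.write("# frame 0 detections\n")
    f.write("120.0,340.0,480.0,900.0,0.87\n")
    f.write("2120.0,350.0,2470.0,910.0,0.81\n")
# The parser above would return two entries, each with 'bbox' as a
# numpy array of four floats and 'confidence' as a float.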
|
||||
|
||||
def debug_detect_with_lower_confidence(self, frame: np.ndarray, debug_confidence: float = 0.3) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Run YOLO detection with a lower confidence threshold for debugging.
|
||||
This helps identify if detections are being missed due to high confidence threshold.
|
||||
|
||||
Args:
|
||||
frame: Input frame (BGR format from OpenCV)
|
||||
debug_confidence: Lower confidence threshold for debugging
|
||||
|
||||
Returns:
|
||||
List of human detection dictionaries with lower confidence threshold
|
||||
"""
|
||||
logger.info(f"VR180 Debug: Running YOLO with lower confidence {debug_confidence} (vs normal {self.confidence_threshold})")
|
||||
|
||||
# Run YOLO detection with lower confidence
|
||||
results = self.model(frame, conf=debug_confidence, verbose=False)
|
||||
|
||||
debug_detections = []
|
||||
|
||||
# Process results
|
||||
for result in results:
|
||||
boxes = result.boxes
|
||||
if boxes is not None:
|
||||
for box in boxes:
|
||||
# Get class ID
|
||||
cls = int(box.cls.cpu().numpy()[0])
|
||||
|
||||
# Check if it's a person (human_class_id)
|
||||
if cls == self.human_class_id:
|
||||
# Get bounding box coordinates (x1, y1, x2, y2)
|
||||
coords = box.xyxy[0].cpu().numpy()
|
||||
conf = float(box.conf.cpu().numpy()[0])
|
||||
|
||||
debug_detections.append({
|
||||
'bbox': coords,
|
||||
'confidence': conf
|
||||
})
|
||||
|
||||
logger.info(f"VR180 Debug: Lower confidence detection found {len(debug_detections)} total detections")
|
||||
return debug_detections
|
||||
|
||||
def detect_humans_multi_frame(self, video_path: str, frame_indices: List[int],
|
||||
scale: float = 1.0) -> Dict[int, List[Dict[str, Any]]]:
|
||||
"""
|
||||
Detect humans at multiple specific frame indices in a video.
|
||||
Used for mid-segment re-detection to improve SAM2 tracking.
|
||||
|
||||
Args:
|
||||
video_path: Path to video file
|
||||
frame_indices: List of frame indices to run detection on (e.g., [0, 30, 60, 90])
|
||||
scale: Scale factor for frame processing
|
||||
|
||||
Returns:
|
||||
Dictionary mapping frame_index -> list of detection dictionaries
|
||||
"""
|
||||
if not frame_indices:
|
||||
logger.warning("No frame indices provided for multi-frame detection")
|
||||
return {}
|
||||
|
||||
if not os.path.exists(video_path):
|
||||
logger.error(f"Video file not found: {video_path}")
|
||||
return {}
|
||||
|
||||
logger.info(f"Mid-segment Detection: Running YOLO on {len(frame_indices)} frames: {frame_indices}")
|
||||
|
||||
cap = cv2.VideoCapture(video_path)
|
||||
if not cap.isOpened():
|
||||
logger.error(f"Could not open video: {video_path}")
|
||||
return {}
|
||||
|
||||
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
|
||||
|
||||
# Filter out frame indices that are beyond video length
|
||||
valid_frame_indices = [idx for idx in frame_indices if 0 <= idx < total_frames]
|
||||
if len(valid_frame_indices) != len(frame_indices):
|
||||
invalid_frames = [idx for idx in frame_indices if idx not in valid_frame_indices]
|
||||
logger.warning(f"Mid-segment Detection: Skipping invalid frame indices: {invalid_frames} (video has {total_frames} frames)")
|
||||
|
||||
multi_frame_detections = {}
|
||||
|
||||
for frame_idx in valid_frame_indices:
|
||||
# Seek to specific frame
|
||||
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
|
||||
ret, frame = cap.read()
|
||||
|
||||
if not ret:
|
||||
logger.warning(f"Mid-segment Detection: Could not read frame {frame_idx}")
|
||||
continue
|
||||
|
||||
# Scale frame if needed
|
||||
if scale != 1.0:
|
||||
frame = cv2.resize(frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
|
||||
|
||||
# Run YOLO detection on this frame
|
||||
detections = self.detect_humans_in_frame(frame)
|
||||
multi_frame_detections[frame_idx] = detections
|
||||
|
||||
# Log detection results
|
||||
time_seconds = frame_idx / fps
|
||||
logger.info(f"Mid-segment Detection: Frame {frame_idx} (t={time_seconds:.1f}s): {len(detections)} humans detected")
|
||||
|
||||
for i, detection in enumerate(detections):
|
||||
bbox = detection['bbox']
|
||||
conf = detection['confidence']
|
||||
logger.debug(f"Mid-segment Detection: Frame {frame_idx}, Human {i+1}: bbox={bbox}, conf={conf:.3f}")
|
||||
|
||||
cap.release()
|
||||
|
||||
total_detections = sum(len(dets) for dets in multi_frame_detections.values())
|
||||
logger.info(f"Mid-segment Detection: Complete - {total_detections} total detections across {len(valid_frame_indices)} frames")
|
||||
|
||||
return multi_frame_detections
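A hedged driver sketch: re-detect roughly once per second of a 30 fps segment (the one-second interval is an illustrative choice, not something this commit fixes; path and detector instance assumed):
frame_indices = list(range(0, 150, 30))           # [0, 30, 60, 90, 120]
per_frame = detector.detect_humans_multi_frame(
    video_path='segments/segment_003.mp4',        # placeholder path
    frame_indices=frame_indices,
    scale=0.5,
)
# per_frame maps frame index -> detection dicts, ready to be turned into
# per-frame SAM2 prompts for add_multi_frame_prompts_to_predictor().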
|
||||
|
||||
def process_segments_batch(self, segments_info: List[dict], detect_segments: List[int],
|
||||
scale: float = 0.5) -> Dict[int, List[Dict[str, Any]]]:
|
||||
"""
|
||||
@@ -224,7 +390,8 @@ class YOLODetector:
|
||||
def convert_detections_to_sam2_prompts(self, detections: List[Dict[str, Any]],
|
||||
frame_width: int) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Convert YOLO detections to SAM2-compatible prompts for stereo video.
|
||||
Convert YOLO detections to SAM2-compatible prompts for VR180 SBS video.
|
||||
For VR180, we expect 2 real detections (left and right eye views), not mirrored ones.
|
||||
|
||||
Args:
|
||||
detections: List of YOLO detection results
|
||||
@@ -234,53 +401,335 @@ class YOLODetector:
|
||||
List of SAM2 prompt dictionaries with obj_id and bbox
|
||||
"""
|
||||
if not detections:
|
||||
logger.warning("No detections provided for SAM2 prompt conversion")
|
||||
return []
|
||||
|
||||
half_frame_width = frame_width // 2
|
||||
prompts = []
|
||||
|
||||
logger.info(f"VR180 SBS Debug: Converting {len(detections)} detections for frame width {frame_width}")
|
||||
logger.info(f"VR180 SBS Debug: Half frame width = {half_frame_width}")
|
||||
|
||||
# Sort detections by x-coordinate to get consistent left/right assignment
|
||||
sorted_detections = sorted(detections, key=lambda x: x['bbox'][0])
|
||||
|
||||
# Analyze detections by frame half
|
||||
left_detections = []
|
||||
right_detections = []
|
||||
|
||||
for i, detection in enumerate(sorted_detections):
|
||||
bbox = detection['bbox'].copy()
|
||||
center_x = (bbox[0] + bbox[2]) / 2
|
||||
pixel_range = f"{bbox[0]:.0f}-{bbox[2]:.0f}"
|
||||
|
||||
if center_x < half_frame_width:
|
||||
left_detections.append((detection, i, pixel_range))
|
||||
side = "LEFT"
|
||||
else:
|
||||
right_detections.append((detection, i, pixel_range))
|
||||
side = "RIGHT"
|
||||
|
||||
logger.info(f"VR180 SBS Debug: Detection {i}: pixels {pixel_range}, center_x={center_x:.1f}, side={side}")
|
||||
|
||||
# VR180 SBS Format Validation
|
||||
logger.info(f"VR180 SBS Debug: Found {len(left_detections)} LEFT detections, {len(right_detections)} RIGHT detections")
|
||||
|
||||
# Analyze confidence scores
|
||||
if left_detections:
|
||||
left_confidences = [det[0]['confidence'] for det in left_detections]
|
||||
logger.info(f"VR180 SBS Debug: LEFT eye confidences: {[f'{c:.3f}' for c in left_confidences]}")
|
||||
|
||||
if right_detections:
|
||||
right_confidences = [det[0]['confidence'] for det in right_detections]
|
||||
logger.info(f"VR180 SBS Debug: RIGHT eye confidences: {[f'{c:.3f}' for c in right_confidences]}")
|
||||
|
||||
if len(right_detections) == 0:
|
||||
logger.warning(f"VR180 SBS Warning: No detections found in RIGHT eye view (pixels {half_frame_width}-{frame_width})")
|
||||
logger.warning(f"VR180 SBS Warning: This may indicate:")
|
||||
logger.warning(f" 1. Person not visible in right eye view")
|
||||
logger.warning(f" 2. YOLO confidence threshold ({self.confidence_threshold}) too high")
|
||||
logger.warning(f" 3. VR180 SBS format issue")
|
||||
logger.warning(f" 4. Right eye view quality/lighting problems")
|
||||
logger.warning(f"VR180 SBS Suggestion: Try lowering yolo_confidence to 0.3-0.4 in config")
|
||||
|
||||
if len(left_detections) == 0:
|
||||
logger.warning(f"VR180 SBS Warning: No detections found in LEFT eye view (pixels 0-{half_frame_width})")
|
||||
|
||||
# Additional validation for VR180 SBS expectations
|
||||
total_detections = len(left_detections) + len(right_detections)
|
||||
if total_detections == 1:
|
||||
logger.warning(f"VR180 SBS Warning: Only 1 detection found - expected 2 for proper VR180 SBS")
|
||||
elif total_detections > 2:
|
||||
logger.warning(f"VR180 SBS Warning: {total_detections} detections found - will use only first 2")
|
||||
|
||||
# Assign object IDs sequentially, regardless of which half they're in
|
||||
# This ensures we always get Object 1 and Object 2 for up to 2 detections
|
||||
obj_id = 1
|
||||
|
||||
for i, detection in enumerate(sorted_detections[:2]): # Take up to 2 humans
|
||||
# Process up to 2 detections total (left + right combined)
|
||||
all_detections = sorted_detections[:2]
|
||||
|
||||
for i, detection in enumerate(all_detections):
|
||||
bbox = detection['bbox'].copy()
|
||||
center_x = (bbox[0] + bbox[2]) / 2
|
||||
pixel_range = f"{bbox[0]:.0f}-{bbox[2]:.0f}"
|
||||
|
||||
# For stereo videos, assign obj_id based on position
|
||||
if len(sorted_detections) >= 2:
|
||||
center_x = (bbox[0] + bbox[2]) / 2
|
||||
if center_x < half_frame_width:
|
||||
current_obj_id = 1 # Left human
|
||||
else:
|
||||
current_obj_id = 2 # Right human
|
||||
# Determine which eye view this detection is in
|
||||
if center_x < half_frame_width:
|
||||
eye_view = "LEFT"
|
||||
else:
|
||||
# If only one human, create prompts for both sides
|
||||
current_obj_id = obj_id
|
||||
obj_id += 1
|
||||
|
||||
# Create mirrored version for stereo
|
||||
if obj_id <= 2:
|
||||
mirrored_bbox = bbox.copy()
|
||||
mirrored_bbox[0] += half_frame_width # Shift x1
|
||||
mirrored_bbox[2] += half_frame_width # Shift x2
|
||||
|
||||
# Ensure mirrored bbox is within frame bounds
|
||||
mirrored_bbox[0] = max(0, min(mirrored_bbox[0], frame_width - 1))
|
||||
mirrored_bbox[2] = max(0, min(mirrored_bbox[2], frame_width - 1))
|
||||
|
||||
prompts.append({
|
||||
'obj_id': obj_id,
|
||||
'bbox': mirrored_bbox,
|
||||
'confidence': detection['confidence']
|
||||
})
|
||||
obj_id += 1
|
||||
eye_view = "RIGHT"
|
||||
|
||||
prompts.append({
|
||||
'obj_id': current_obj_id,
|
||||
'obj_id': obj_id,
|
||||
'bbox': bbox,
|
||||
'confidence': detection['confidence']
|
||||
})
|
||||
|
||||
logger.info(f"VR180 SBS Debug: Added {eye_view} eye detection as SAM2 Object {obj_id}")
|
||||
logger.info(f"VR180 SBS Debug: Object {obj_id} bbox: {bbox} (pixels {pixel_range})")
|
||||
|
||||
obj_id += 1
|
||||
|
||||
logger.debug(f"Converted {len(detections)} detections to {len(prompts)} SAM2 prompts")
|
||||
return prompts
|
||||
logger.info(f"VR180 SBS Debug: Final result - {len(detections)} YOLO detections → {len(prompts)} SAM2 prompts")
|
||||
|
||||
# Verify we have the expected objects
|
||||
obj_ids = [p['obj_id'] for p in prompts]
|
||||
logger.info(f"VR180 SBS Debug: SAM2 Object IDs created: {obj_ids}")
|
||||
|
||||
return prompts
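For a 3840-pixel-wide SBS frame, the conversion above sorts by x and assigns sequential object IDs; a sketch with invented boxes, one per eye view, assuming a detector instance:
import numpy as np

detections = [
    {'bbox': np.array([400.0, 300.0, 900.0, 1400.0]), 'confidence': 0.88},    # left half
    {'bbox': np.array([2300.0, 310.0, 2800.0, 1410.0]), 'confidence': 0.84},  # right half
]
prompts = detector.convert_detections_to_sam2_prompts(detections, frame_width=3840)
# -> Object 1 for the left-eye box, Object 2 for the right-eye box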
|
||||
|
||||
def convert_yolo_masks_to_video_segments(self, detections: List[Dict[str, Any]],
|
||||
frame_width: int, target_frame_shape: Tuple[int, int] = None) -> Optional[Dict[int, Dict[int, np.ndarray]]]:
|
||||
"""
|
||||
Convert YOLO segmentation masks to SAM2-compatible video segments format.
|
||||
This allows using YOLO masks directly without SAM2 processing.
|
||||
|
||||
Args:
|
||||
detections: List of YOLO detection results with masks
|
||||
frame_width: Width of the video frame for VR180 object ID assignment
|
||||
target_frame_shape: Target shape (height, width) for mask resizing
|
||||
|
||||
Returns:
|
||||
Video segments dictionary compatible with SAM2 output format, or None if no masks
|
||||
"""
|
||||
if not detections:
|
||||
logger.warning("No detections provided for mask conversion")
|
||||
return None
|
||||
|
||||
# Check if any detections have masks
|
||||
detections_with_masks = [d for d in detections if d.get('has_mask', False)]
|
||||
if not detections_with_masks:
|
||||
logger.warning("No detections have masks - YOLO segmentation may not be working")
|
||||
return None
|
||||
|
||||
logger.info(f"YOLO Mask Conversion: Converting {len(detections_with_masks)} YOLO masks to video segments format")
|
||||
|
||||
half_frame_width = frame_width // 2
|
||||
video_segments = {}
|
||||
|
||||
# Create frame 0 with converted masks
|
||||
frame_masks = {}
|
||||
obj_id = 1
|
||||
|
||||
# Sort detections by x-coordinate for consistent VR180 SBS assignment
|
||||
sorted_detections = sorted(detections_with_masks, key=lambda x: x['bbox'][0])
|
||||
|
||||
for i, detection in enumerate(sorted_detections[:2]): # Take up to 2 humans
|
||||
mask = detection['mask']
|
||||
bbox = detection['bbox']
|
||||
center_x = (bbox[0] + bbox[2]) / 2
|
||||
|
||||
# Assign sequential object IDs (similar to prompt conversion logic)
|
||||
current_obj_id = obj_id
|
||||
|
||||
# Determine which eye view for logging
|
||||
if center_x < half_frame_width:
|
||||
eye_view = "LEFT"
|
||||
else:
|
||||
eye_view = "RIGHT"
|
||||
|
||||
# Resize mask to target frame shape if specified
|
||||
if target_frame_shape and mask.shape != target_frame_shape:
|
||||
mask_resized = cv2.resize(mask.astype(np.float32), (target_frame_shape[1], target_frame_shape[0]), interpolation=cv2.INTER_NEAREST)
|
||||
mask = (mask_resized > 0.5).astype(bool)
|
||||
else:
|
||||
mask = mask.astype(bool)
|
||||
|
||||
frame_masks[current_obj_id] = mask
|
||||
|
||||
logger.info(f"YOLO Mask Conversion: {eye_view} eye detection -> Object {current_obj_id}, mask_shape={mask.shape}, pixels={np.sum(mask)}")
|
||||
|
||||
obj_id += 1 # Always increment for next detection
|
||||
|
||||
# Store masks in video segments format (single frame)
|
||||
video_segments[0] = frame_masks
|
||||
|
||||
total_objects = len(frame_masks)
|
||||
total_pixels = sum(np.sum(mask) for mask in frame_masks.values())
|
||||
logger.info(f"YOLO Mask Conversion: Created video segments with {total_objects} objects, {total_pixels} total mask pixels")
|
||||
|
||||
return video_segments
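The return value mirrors SAM2's propagation output, frame index -> object id -> boolean mask, with only frame 0 populated; a short consumption sketch, assuming seg_detections came from a segmentation-mode detector and therefore carry masks:
video_segments = detector.convert_yolo_masks_to_video_segments(seg_detections, frame_width=3840)
if video_segments:
    for obj_id, mask in video_segments[0].items():
        print(f"Object {obj_id}: shape={mask.shape}, pixels={mask.sum()}")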
|
||||
|
||||
def save_debug_frame_with_detections(self, frame: np.ndarray, detections: List[Dict[str, Any]],
|
||||
output_path: str, prompts: List[Dict[str, Any]] = None) -> bool:
|
||||
"""
|
||||
Save a debug frame with YOLO detections and SAM2 prompts overlaid as bounding boxes.
|
||||
|
||||
Args:
|
||||
frame: Input frame (BGR format from OpenCV)
|
||||
detections: List of detection dictionaries with bbox and confidence
|
||||
output_path: Path to save the debug image
|
||||
prompts: Optional list of SAM2 prompt dictionaries with obj_id and bbox
|
||||
|
||||
Returns:
|
||||
True if saved successfully
|
||||
"""
|
||||
try:
|
||||
debug_frame = frame.copy()
|
||||
|
||||
# Draw masks (if available) or bounding boxes for each detection
|
||||
for i, detection in enumerate(detections):
|
||||
bbox = detection['bbox']
|
||||
confidence = detection['confidence']
|
||||
has_mask = detection.get('has_mask', False)
|
||||
|
||||
# Extract coordinates
|
||||
x1, y1, x2, y2 = map(int, bbox)
|
||||
|
||||
# Choose color based on confidence (green for high, yellow for medium, red for low)
|
||||
if confidence >= 0.8:
|
||||
color = (0, 255, 0) # Green
|
||||
elif confidence >= 0.6:
|
||||
color = (0, 255, 255) # Yellow
|
||||
else:
|
||||
color = (0, 0, 255) # Red
|
||||
|
||||
if has_mask and 'mask' in detection:
|
||||
# Draw segmentation mask
|
||||
mask = detection['mask']
|
||||
|
||||
# Resize mask to match frame if needed
|
||||
if mask.shape != debug_frame.shape[:2]:
|
||||
mask = cv2.resize(mask.astype(np.float32), (debug_frame.shape[1], debug_frame.shape[0]), interpolation=cv2.INTER_NEAREST)
|
||||
mask = mask > 0.5
|
||||
|
||||
mask = mask.astype(bool)
|
||||
|
||||
# Apply colored overlay with transparency
|
||||
overlay = debug_frame.copy()
|
||||
overlay[mask] = color
|
||||
cv2.addWeighted(overlay, 0.3, debug_frame, 0.7, 0, debug_frame)
|
||||
|
||||
# Draw mask outline
|
||||
contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
cv2.drawContours(debug_frame, contours, -1, color, 2)
|
||||
|
||||
# Prepare label text for segmentation
|
||||
label = f"Person {i+1}: {confidence:.2f} (MASK)"
|
||||
else:
|
||||
# Draw bounding box (detection mode or no mask available)
|
||||
cv2.rectangle(debug_frame, (x1, y1), (x2, y2), color, 2)
|
||||
|
||||
# Prepare label text for detection
|
||||
label = f"Person {i+1}: {confidence:.2f} (BBOX)"
|
||||
|
||||
label_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
|
||||
|
||||
# Draw label background
|
||||
cv2.rectangle(debug_frame,
|
||||
(x1, y1 - label_size[1] - 10),
|
||||
(x1 + label_size[0], y1),
|
||||
color, -1)
|
||||
|
||||
# Draw label text
|
||||
cv2.putText(debug_frame, label,
|
||||
(x1, y1 - 5),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.6,
|
||||
(255, 255, 255), 2)
|
||||
|
||||
# Draw SAM2 prompts if provided (with different colors/style)
|
||||
if prompts:
|
||||
for prompt in prompts:
|
||||
obj_id = prompt['obj_id']
|
||||
bbox = prompt['bbox']
|
||||
|
||||
# Extract coordinates
|
||||
x1, y1, x2, y2 = map(int, bbox)
|
||||
|
||||
# Use different colors for each object ID
|
||||
if obj_id == 1:
|
||||
prompt_color = (0, 255, 0) # Green for Object 1
|
||||
elif obj_id == 2:
|
||||
prompt_color = (255, 0, 0) # Blue for Object 2
|
||||
else:
|
||||
prompt_color = (255, 255, 0) # Cyan for others
|
||||
|
||||
# Draw thicker, dashed-style border for SAM2 prompts
|
||||
thickness = 3
|
||||
cv2.rectangle(debug_frame, (x1-2, y1-2), (x2+2, y2+2), prompt_color, thickness)
|
||||
|
||||
# Add SAM2 object ID label
|
||||
sam_label = f"SAM2 Obj {obj_id}"
|
||||
label_size = cv2.getTextSize(sam_label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0]
|
||||
|
||||
# Draw label background
|
||||
cv2.rectangle(debug_frame,
|
||||
(x1-2, y2+5),
|
||||
(x1-2 + label_size[0], y2+5 + label_size[1] + 5),
|
||||
prompt_color, -1)
|
||||
|
||||
# Draw label text
|
||||
cv2.putText(debug_frame, sam_label,
|
||||
(x1-2, y2+5 + label_size[1]),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.5,
|
||||
(255, 255, 255), 2)
|
||||
|
||||
# Draw VR180 SBS boundary line (center line separating left and right eye views)
|
||||
frame_height, frame_width = debug_frame.shape[:2]
|
||||
center_x = frame_width // 2
|
||||
cv2.line(debug_frame, (center_x, 0), (center_x, frame_height), (0, 255, 255), 3) # Yellow line
|
||||
|
||||
# Add VR180 SBS labels
|
||||
cv2.putText(debug_frame, "LEFT EYE", (10, frame_height - 20),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)
|
||||
cv2.putText(debug_frame, "RIGHT EYE", (center_x + 10, frame_height - 20),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)
|
||||
|
||||
# Add summary text at top with mode information
|
||||
mode_text = f"YOLO Mode: {self.mode.upper()}"
|
||||
masks_available = sum(1 for d in detections if d.get('has_mask', False))
|
||||
|
||||
if self.supports_segmentation and masks_available > 0:
|
||||
summary = f"VR180 SBS: {len(detections)} detections → {masks_available} MASKS (for SAM2 propagation)"
|
||||
else:
|
||||
summary = f"VR180 SBS: {len(detections)} detections → {len(prompts) if prompts else 0} SAM2 prompts"
|
||||
|
||||
cv2.putText(debug_frame, mode_text,
|
||||
(10, 30),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.8,
|
||||
(0, 255, 255), 2) # Yellow for mode
|
||||
cv2.putText(debug_frame, summary,
|
||||
(10, 60),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 1.0,
|
||||
(255, 255, 255), 2)
|
||||
|
||||
# Add frame dimensions info
|
||||
dims_info = f"Frame: {frame_width}x{frame_height}, Center: {center_x}"
|
||||
cv2.putText(debug_frame, dims_info,
|
||||
(10, 90),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.6,
|
||||
(255, 255, 255), 2)
|
||||
|
||||
# Save debug frame
|
||||
success = cv2.imwrite(output_path, debug_frame)
|
||||
if success:
|
||||
logger.info(f"Saved YOLO debug frame to {output_path}")
|
||||
else:
|
||||
logger.error(f"Failed to save debug frame to {output_path}")
|
||||
|
||||
return success
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating debug frame: {e}")
|
||||
return False
|
||||