#!/usr/bin/env python3
"""
Main entry point for YOLO + SAM2 video processing pipeline.
Processes long videos by splitting them into segments, detecting humans with YOLO,
and creating green screen masks with SAM2.
"""
import os
import sys
import argparse
import cv2
import numpy as np
from typing import List
# Add project root to path
sys.path.append(os.path.dirname(__file__))
from core.config_loader import ConfigLoader
from core.video_splitter import VideoSplitter
from core.yolo_detector import YOLODetector
from core.sam2_processor import SAM2Processor
from core.mask_processor import MaskProcessor
from core.video_assembler import VideoAssembler
from utils.logging_utils import setup_logging, get_logger
from utils.file_utils import ensure_directory
from utils.status_utils import print_processing_status, cleanup_incomplete_segment
logger = get_logger(__name__)
def parse_arguments():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="YOLO + SAM2 Video Processing Pipeline"
)
parser.add_argument(
"--config",
type=str,
required=True,
help="Path to YAML configuration file"
)
parser.add_argument(
"--log-file",
type=str,
help="Optional log file path"
)
parser.add_argument(
"--status",
action="store_true",
help="Show processing status and exit"
)
parser.add_argument(
"--cleanup-segment",
type=int,
help="Clean up a specific segment for restart (segment index)"
)
return parser.parse_args()
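
# Example invocations (paths are illustrative):
#   python main.py --config config.yaml
#   python main.py --config config.yaml --status
#   python main.py --config config.yaml --cleanup-segment 3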
def validate_dependencies():
"""Validate that required dependencies are available."""
try:
import torch
import cv2
import numpy as np
import cupy as cp
from ultralytics import YOLO
from sam2.build_sam import build_sam2_video_predictor
logger.info("All dependencies validated successfully")
return True
except ImportError as e:
logger.error(f"Missing dependency: {e}")
logger.error("Please install requirements: pip install -r requirements.txt")
return False
def create_yolo_mask_debug_frame(detections: List[dict], video_path: str, output_path: str, scale: float = 1.0) -> bool:
"""
Create debug visualization for YOLO direct masks.
Args:
detections: List of YOLO detections with masks
video_path: Path to video file
output_path: Path to save debug image
scale: Scale factor for frame processing
Returns:
True if debug frame was created successfully
"""
try:
# Load first frame
cap = cv2.VideoCapture(video_path)
ret, original_frame = cap.read()
cap.release()
if not ret:
logger.error("Could not read first frame for YOLO mask debug")
return False
# Scale frame if needed
if scale != 1.0:
original_frame = cv2.resize(original_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
debug_frame = original_frame.copy()
# Define colors for each object
colors = {
1: (0, 255, 0), # Green for Object 1 (Left eye)
2: (255, 0, 0), # Blue for Object 2 (Right eye)
}
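        # Note: OpenCV uses BGR channel order, so (255, 0, 0) renders as blue.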
# Get detections with masks
detections_with_masks = [d for d in detections if d.get('has_mask', False)]
# Overlay masks with transparency
obj_id = 1
for detection in detections_with_masks[:2]: # Up to 2 objects
mask = detection['mask']
# Resize mask to match frame if needed
if mask.shape != original_frame.shape[:2]:
mask = cv2.resize(mask.astype(np.float32), (original_frame.shape[1], original_frame.shape[0]), interpolation=cv2.INTER_NEAREST)
mask = mask > 0.5
mask = mask.astype(bool)
# Apply colored overlay
color = colors.get(obj_id, (128, 128, 128))
overlay = debug_frame.copy()
overlay[mask] = color
# Blend with original (30% overlay, 70% original)
cv2.addWeighted(overlay, 0.3, debug_frame, 0.7, 0, debug_frame)
# Draw outline
contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cv2.drawContours(debug_frame, contours, -1, color, 2)
logger.info(f"YOLO Mask Debug: Object {obj_id} mask - shape: {mask.shape}, pixels: {np.sum(mask)}")
obj_id += 1
# Add title and source info
title = f"YOLO Direct Masks: {len(detections_with_masks)} objects detected"
cv2.putText(debug_frame, title, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)
source_info = "Mask Source: YOLO Segmentation (DIRECT - No SAM2)"
cv2.putText(debug_frame, source_info, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) # Green for YOLO
# Add object legend
y_offset = 90
for i, detection in enumerate(detections_with_masks[:2]):
obj_id = i + 1
color = colors.get(obj_id, (128, 128, 128))
text = f"Object {obj_id}: {'Left Eye' if obj_id == 1 else 'Right Eye'} (YOLO Mask)"
cv2.putText(debug_frame, text, (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
y_offset += 30
# Save debug image
success = cv2.imwrite(output_path, debug_frame)
if success:
logger.info(f"YOLO Mask Debug: Saved debug frame to {output_path}")
else:
logger.error(f"Failed to save YOLO mask debug frame to {output_path}")
return success
except Exception as e:
logger.error(f"Error creating YOLO mask debug frame: {e}")
return False
def resolve_detect_segments(detect_segments, total_segments: int) -> List[int]:
"""
Resolve detect_segments configuration to list of segment indices.
Args:
detect_segments: Configuration value ("all", list, or None)
total_segments: Total number of segments
Returns:
List of segment indices to process
"""
if detect_segments == "all" or detect_segments is None:
return list(range(total_segments))
elif isinstance(detect_segments, list):
# Filter out invalid segment indices
valid_segments = [s for s in detect_segments if 0 <= s < total_segments]
if len(valid_segments) != len(detect_segments):
logger.warning(f"Some segment indices are invalid. Using: {valid_segments}")
return valid_segments
else:
logger.warning(f"Invalid detect_segments format: {detect_segments}. Using all segments.")
return list(range(total_segments))
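
# Illustrative behaviour (indices here are hypothetical):
#   resolve_detect_segments("all", 4)      -> [0, 1, 2, 3]
#   resolve_detect_segments([0, 2, 99], 4) -> [0, 2]         (99 is dropped with a warning)
#   resolve_detect_segments("invalid", 4)  -> [0, 1, 2, 3]   (falls back to all segments)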
def process_segment_with_separate_eyes(segment_info, detector, sam2_processor, mask_processor, config,
previous_left_masks=None, previous_right_masks=None):
"""
Process a single segment using separate eye processing mode.
Split video first, then run YOLO independently on each eye.
Args:
segment_info: Segment information dictionary
detector: YOLO detector instance
sam2_processor: SAM2 processor with eye processing enabled
mask_processor: Mask processor instance
config: Configuration loader instance
previous_left_masks: Previous masks for left eye
previous_right_masks: Previous masks for right eye
Returns:
Tuple of (success, left_masks, right_masks)
"""
segment_idx = segment_info['index']
logger.info(f"VR180 Separate Eyes: Processing segment {segment_idx} (video-split approach)")
# Get video properties
cap = cv2.VideoCapture(segment_info['video_file'])
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
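    # numpy-style (rows, cols) ordering: combined masks are indexed as (height, width)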
full_frame_shape = (frame_height, frame_width)
# Step 1: Split the segment video into left and right eye videos
left_eye_video = os.path.join(segment_info['directory'], "left_eye.mp4")
right_eye_video = os.path.join(segment_info['directory'], "right_eye.mp4")
logger.info(f"VR180 Separate Eyes: Splitting segment video into eye videos")
success = sam2_processor.eye_processor.split_video_into_eyes(
segment_info['video_file'],
left_eye_video,
right_eye_video,
scale=config.get_inference_scale()
)
if not success:
logger.error(f"VR180 Separate Eyes: Failed to split video for segment {segment_idx}")
return False, None, None
# Check if both eye videos were created
if not os.path.exists(left_eye_video) or not os.path.exists(right_eye_video):
logger.error(f"VR180 Separate Eyes: Eye video files not created for segment {segment_idx}")
return False, None, None
logger.info(f"VR180 Separate Eyes: Created eye videos - left: {left_eye_video}, right: {right_eye_video}")
# Step 2: Run YOLO independently on each eye video
left_detections = detector.detect_humans_in_video_first_frame(
left_eye_video, scale=1.0 # Already scaled during video splitting
)
right_detections = detector.detect_humans_in_video_first_frame(
right_eye_video, scale=1.0 # Already scaled during video splitting
)
logger.info(f"VR180 Separate Eyes: YOLO detections - left: {len(left_detections)}, right: {len(right_detections)}")
# Check if we have YOLO segmentation masks
has_yolo_masks = False
if detector.supports_segmentation:
has_yolo_masks = any(d.get('has_mask', False) for d in (left_detections + right_detections))
if has_yolo_masks:
logger.info(f"VR180 Separate Eyes: YOLO segmentation mode - using direct masks instead of bounding boxes")
# Save eye-specific debug frames if enabled
if config.get('advanced.save_yolo_debug_frames', False) and (left_detections or right_detections):
try:
# Load first frames from each eye video
left_cap = cv2.VideoCapture(left_eye_video)
ret_left, left_frame = left_cap.read()
left_cap.release()
right_cap = cv2.VideoCapture(right_eye_video)
ret_right, right_frame = right_cap.read()
right_cap.release()
if ret_left and ret_right:
# Save eye-specific debug frames
left_debug_path = os.path.join(segment_info['directory'], "left_eye_debug.jpg")
right_debug_path = os.path.join(segment_info['directory'], "right_eye_debug.jpg")
detector.save_eye_debug_frames(
left_frame, right_frame,
left_detections, right_detections,
left_debug_path, right_debug_path
)
logger.info(f"VR180 Separate Eyes: Saved eye-specific debug frames for segment {segment_idx}")
else:
logger.warning(f"VR180 Separate Eyes: Could not load eye frames for debug visualization")
except Exception as e:
logger.warning(f"VR180 Separate Eyes: Failed to create eye debug frames: {e}")
# Step 3: Process left eye if detections exist or we have previous masks
left_masks = None
if left_detections or previous_left_masks:
try:
left_prompts = None
left_initial_masks = None
if left_detections:
if has_yolo_masks:
# YOLO segmentation mode: convert masks to initial masks for SAM2
left_initial_masks = {}
for i, detection in enumerate(left_detections):
if detection.get('has_mask', False):
mask = detection['mask']
left_initial_masks[1] = mask.astype(bool) # Always use obj_id=1 for single eye
logger.info(f"VR180 Separate Eyes: Left eye YOLO mask - shape: {mask.shape}, pixels: {np.sum(mask)}")
break # Only take the first/best mask for single eye processing
if left_initial_masks:
logger.info(f"VR180 Separate Eyes: Left eye - using YOLO segmentation masks as initial masks")
else:
# YOLO detection mode: convert bounding boxes to prompts
left_prompts = detector.convert_detections_to_sam2_prompts(left_detections, frame_width // 2)
logger.info(f"VR180 Separate Eyes: Left eye - {len(left_prompts)} SAM2 prompts")
# Create temporary segment info for left eye processing
left_segment_info = segment_info.copy()
left_segment_info['video_file'] = left_eye_video
left_masks = sam2_processor.process_single_eye_segment(
left_segment_info, 'left', left_prompts,
left_initial_masks or previous_left_masks,
1.0 # Scale already applied during video splitting
)
if left_masks:
logger.info(f"VR180 Separate Eyes: Left eye processed - {len(left_masks)} frame masks")
else:
logger.warning(f"VR180 Separate Eyes: Left eye processing failed")
except Exception as e:
logger.error(f"VR180 Separate Eyes: Error processing left eye for segment {segment_idx}: {e}")
left_masks = None
# Step 4: Process right eye if detections exist or we have previous masks
right_masks = None
if right_detections or previous_right_masks:
try:
right_prompts = None
right_initial_masks = None
if right_detections:
if has_yolo_masks:
# YOLO segmentation mode: convert masks to initial masks for SAM2
right_initial_masks = {}
for i, detection in enumerate(right_detections):
if detection.get('has_mask', False):
mask = detection['mask']
right_initial_masks[1] = mask.astype(bool) # Always use obj_id=1 for single eye
logger.info(f"VR180 Separate Eyes: Right eye YOLO mask - shape: {mask.shape}, pixels: {np.sum(mask)}")
break # Only take the first/best mask for single eye processing
if right_initial_masks:
logger.info(f"VR180 Separate Eyes: Right eye - using YOLO segmentation masks as initial masks")
else:
# YOLO detection mode: convert bounding boxes to prompts
right_prompts = detector.convert_detections_to_sam2_prompts(right_detections, frame_width // 2)
logger.info(f"VR180 Separate Eyes: Right eye - {len(right_prompts)} SAM2 prompts")
# Create temporary segment info for right eye processing
right_segment_info = segment_info.copy()
right_segment_info['video_file'] = right_eye_video
right_masks = sam2_processor.process_single_eye_segment(
right_segment_info, 'right', right_prompts,
right_initial_masks or previous_right_masks,
1.0 # Scale already applied during video splitting
)
if right_masks:
logger.info(f"VR180 Separate Eyes: Right eye processed - {len(right_masks)} frame masks")
else:
logger.warning(f"VR180 Separate Eyes: Right eye processing failed")
except Exception as e:
logger.error(f"VR180 Separate Eyes: Error processing right eye for segment {segment_idx}: {e}")
right_masks = None
# Step 5: Check if we got any valid masks
if not left_masks and not right_masks:
logger.warning(f"VR180 Separate Eyes: Neither eye produced valid masks for segment {segment_idx}")
if config.get('processing.enable_greenscreen_fallback', True):
logger.info(f"VR180 Separate Eyes: Using greenscreen fallback for segment {segment_idx}")
success = mask_processor.process_greenscreen_only_segment(
segment_info,
green_color=config.get_green_color(),
use_nvenc=config.get_use_nvenc(),
bitrate=config.get_output_bitrate()
)
return success, None, None
else:
logger.error(f"VR180 Separate Eyes: No masks generated and greenscreen fallback disabled")
return False, None, None
# Step 6: Combine masks back to full frame format
try:
logger.info(f"VR180 Separate Eyes: Combining eye masks for segment {segment_idx}")
combined_masks = sam2_processor.eye_processor.combine_eye_masks(
left_masks, right_masks, full_frame_shape
)
if not combined_masks:
logger.error(f"VR180 Separate Eyes: Failed to combine eye masks for segment {segment_idx}")
return False, left_masks, right_masks
# Validate combined masks have reasonable content
total_mask_pixels = 0
for frame_idx, frame_masks in combined_masks.items():
for obj_id, mask in frame_masks.items():
if mask is not None:
total_mask_pixels += np.sum(mask)
if total_mask_pixels == 0:
logger.warning(f"VR180 Separate Eyes: Combined masks are empty for segment {segment_idx}")
if config.get('processing.enable_greenscreen_fallback', True):
logger.info(f"VR180 Separate Eyes: Using greenscreen fallback due to empty masks")
success = mask_processor.process_greenscreen_only_segment(
segment_info,
green_color=config.get_green_color(),
use_nvenc=config.get_use_nvenc(),
bitrate=config.get_output_bitrate()
)
return success, left_masks, right_masks
logger.info(f"VR180 Separate Eyes: Combined masks contain {total_mask_pixels} total pixels")
except Exception as e:
logger.error(f"VR180 Separate Eyes: Error combining eye masks for segment {segment_idx}: {e}")
# Try greenscreen fallback if mask combination fails
if config.get('processing.enable_greenscreen_fallback', True):
logger.info(f"VR180 Separate Eyes: Using greenscreen fallback due to mask combination error")
success = mask_processor.process_greenscreen_only_segment(
segment_info,
green_color=config.get_green_color(),
use_nvenc=config.get_use_nvenc(),
bitrate=config.get_output_bitrate()
)
return success, left_masks, right_masks
else:
return False, left_masks, right_masks
# Step 7: Save combined masks
mask_path = os.path.join(segment_info['directory'], "mask.png")
sam2_processor.save_final_masks(
combined_masks,
mask_path,
green_color=config.get_green_color(),
blue_color=config.get_blue_color()
)
# Step 8: Apply green screen and save output video
success = mask_processor.process_segment(
segment_info,
combined_masks,
use_nvenc=config.get_use_nvenc(),
bitrate=config.get_output_bitrate()
)
if success:
logger.info(f"VR180 Separate Eyes: Successfully processed segment {segment_idx}")
else:
logger.error(f"VR180 Separate Eyes: Failed to create output video for segment {segment_idx}")
# Clean up temporary eye video files
try:
if os.path.exists(left_eye_video):
os.remove(left_eye_video)
if os.path.exists(right_eye_video):
os.remove(right_eye_video)
logger.debug(f"VR180 Separate Eyes: Cleaned up temporary eye videos for segment {segment_idx}")
except Exception as e:
logger.warning(f"VR180 Separate Eyes: Failed to clean up temporary eye videos: {e}")
return success, left_masks, right_masks
async def main_async():
"""Main processing pipeline with async optimizations."""
    args = parse_arguments()
    # Defined before the try block so the finally clause below can always reference it
    async_preprocessor = None
    try:
# Load configuration
config = ConfigLoader(args.config)
# Setup logging
setup_logging(config.get_log_level(), args.log_file)
# Handle status check
if args.status:
output_dir = config.get_output_directory()
input_video = config.get_input_video_path()
video_name = os.path.splitext(os.path.basename(input_video))[0]
segments_dir = os.path.join(output_dir, f"{video_name}_segments")
print_processing_status(segments_dir)
return 0
# Handle segment cleanup
if args.cleanup_segment is not None:
output_dir = config.get_output_directory()
input_video = config.get_input_video_path()
video_name = os.path.splitext(os.path.basename(input_video))[0]
segments_dir = os.path.join(output_dir, f"{video_name}_segments")
segment_dir = os.path.join(segments_dir, f"segment_{args.cleanup_segment}")
if cleanup_incomplete_segment(segment_dir):
logger.info(f"Successfully cleaned up segment {args.cleanup_segment}")
return 0
else:
logger.error(f"Failed to clean up segment {args.cleanup_segment}")
return 1
logger.info("Starting YOLO + SAM2 video processing pipeline")
# Validate dependencies
if not validate_dependencies():
return 1
# Validate input video exists
input_video = config.get_input_video_path()
if not os.path.exists(input_video):
logger.error(f"Input video not found: {input_video}")
return 1
# Setup output directory
output_dir = config.get_output_directory()
ensure_directory(output_dir)
# Step 1: Split video into segments
logger.info("Step 1: Splitting video into segments")
splitter = VideoSplitter(
segment_duration=config.get_segment_duration(),
force_keyframes=config.get('video.force_keyframes', True)
)
segments_dir, segment_dirs = splitter.split_video(input_video, output_dir)
logger.info(f"Created {len(segment_dirs)} segments in {segments_dir}")
# Get detailed segment information
segments_info = splitter.get_segment_info(segments_dir)
# Resolve which segments to process with YOLO
detect_segments_config = config.get_detect_segments()
detect_segments = resolve_detect_segments(detect_segments_config, len(segments_info))
# Initialize processors once
logger.info("Step 2: Initializing YOLO detector")
# Get YOLO mode and model paths
yolo_mode = config.get('models.yolo_mode', 'detection')
detection_model = config.get('models.yolo_detection_model', config.get_yolo_model_path())
segmentation_model = config.get('models.yolo_segmentation_model', None)
logger.info(f"YOLO Mode: {yolo_mode}")
detector = YOLODetector(
detection_model_path=detection_model,
segmentation_model_path=segmentation_model,
mode=yolo_mode,
confidence_threshold=config.get_yolo_confidence(),
human_class_id=config.get_human_class_id()
)
logger.info("Step 3: Initializing SAM2 processor")
# Check if separate eye processing is enabled
separate_eye_processing = config.get('processing.separate_eye_processing', False)
eye_overlap_pixels = config.get('processing.eye_overlap_pixels', 0)
enable_greenscreen_fallback = config.get('processing.enable_greenscreen_fallback', True)
# Initialize async preprocessor if enabled
async_preprocessor = None
if config.get('advanced.enable_background_lowres_generation', False):
from core.async_lowres_preprocessor import AsyncLowResPreprocessor
max_concurrent = config.get('advanced.max_concurrent_lowres', 3)
segments_ahead = config.get('advanced.lowres_segments_ahead', 3)
use_ffmpeg = config.get('advanced.use_ffmpeg_lowres', True)
async_preprocessor = AsyncLowResPreprocessor(
max_concurrent=max_concurrent,
segments_ahead=segments_ahead,
use_ffmpeg=use_ffmpeg
)
logger.info(f"Async low-res preprocessing: ENABLED (max_concurrent={max_concurrent}, segments_ahead={segments_ahead})")
else:
logger.info("Async low-res preprocessing: DISABLED")
if separate_eye_processing:
logger.info("VR180 Separate Eye Processing: ENABLED")
logger.info(f"Eye overlap pixels: {eye_overlap_pixels}")
logger.info(f"Greenscreen fallback: {enable_greenscreen_fallback}")
sam2_processor = SAM2Processor(
checkpoint_path=config.get_sam2_checkpoint(),
config_path=config.get_sam2_config(),
vos_optimized=config.get('models.sam2_vos_optimized', False),
separate_eye_processing=separate_eye_processing,
eye_overlap_pixels=eye_overlap_pixels,
async_preprocessor=async_preprocessor
)
# Initialize mask processor with quality enhancements
mask_quality_config = config.get('mask_processing', {})
mask_processor = MaskProcessor(
green_color=config.get_green_color(),
blue_color=config.get_blue_color(),
mask_quality_config=mask_quality_config
)
# Process each segment sequentially (YOLO -> SAM2 -> Render)
logger.info("Step 4: Processing segments sequentially")
total_humans_detected = 0
# Start background low-res video preprocessing if enabled
if async_preprocessor:
logger.info("Starting background low-res video preprocessing")
async_preprocessor.start_background_preparation(
segments_info,
config.get_inference_scale(),
separate_eye_processing,
current_segment=0
)
# Initialize previous masks for separate eye processing
previous_left_masks = None
previous_right_masks = None
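        # Carried forward so SAM2 can keep tracking across segment boundaries when a
        # later segment yields no fresh YOLO detections for one or both eyes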
for i, segment_info in enumerate(segments_info):
segment_idx = segment_info['index']
logger.info(f"Processing segment {segment_idx}/{len(segments_info)-1}")
# Start background preparation for upcoming segments
if async_preprocessor and i < len(segments_info) - 1:
async_preprocessor.start_background_preparation(
segments_info,
config.get_inference_scale(),
separate_eye_processing,
current_segment=i
)
# Reset temporal history for new segment
mask_processor.reset_temporal_history()
            # Skip segments whose output already exists so an interrupted run can resume
output_video = os.path.join(segment_info['directory'], f"output_{segment_idx}.mp4")
if os.path.exists(output_video):
logger.info(f"Segment {segment_idx} already processed, skipping")
continue
# Branch based on processing mode
if separate_eye_processing:
# Use separate eye processing mode
success, left_masks, right_masks = process_segment_with_separate_eyes(
segment_info, detector, sam2_processor, mask_processor, config,
previous_left_masks, previous_right_masks
)
# Update previous masks for next segment
previous_left_masks = left_masks
previous_right_masks = right_masks
if success:
logger.info(f"Successfully processed segment {segment_idx} with separate eye processing")
else:
logger.error(f"Failed to process segment {segment_idx} with separate eye processing")
continue # Skip the original processing logic
# Determine if we should use YOLO detections or previous masks
use_detections = segment_idx in detect_segments
# First segment must use detections
if segment_idx == 0 and not use_detections:
logger.warning(f"First segment must use YOLO detection")
use_detections = True
# Get YOLO prompts or previous masks
yolo_prompts = None
previous_masks = None
if use_detections:
# Run YOLO detection on current segment
logger.info(f"Running YOLO detection on segment {segment_idx}")
detection_file = os.path.join(segment_info['directory'], "yolo_detections")
# Check if detection already exists
if os.path.exists(detection_file):
logger.info(f"Loading existing YOLO detections for segment {segment_idx}")
detections = detector.load_detections_from_file(detection_file)
else:
# Run YOLO detection on first frame
detections = detector.detect_humans_in_video_first_frame(
segment_info['video_file'],
scale=config.get_inference_scale()
)
# Save detections for future runs
detector.save_detections_to_file(detections, detection_file)
if detections:
total_humans_detected += len(detections)
logger.info(f"Found {len(detections)} humans in segment {segment_idx}")
# Get frame width from video
cap = cv2.VideoCapture(segment_info['video_file'])
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
cap.release()
yolo_prompts = detector.convert_detections_to_sam2_prompts(
detections, frame_width
)
# If no right eye detections found, run debug analysis with lower confidence
half_frame_width = frame_width // 2
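                    # Side-by-side VR180 heuristic: a detection whose bbox centre x falls in the
                    # right half of the frame counts as a right-eye detection (assumes [x1, y1, x2, y2] boxes)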
right_eye_detections = [d for d in detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width]
if len(right_eye_detections) == 0 and config.get('advanced.save_yolo_debug_frames', False):
logger.info(f"VR180 Debug: No right eye detections found, running lower confidence analysis...")
# Load first frame for debug analysis
cap = cv2.VideoCapture(segment_info['video_file'])
ret, debug_frame = cap.read()
cap.release()
if ret:
# Scale frame to match detection scale
if config.get_inference_scale() != 1.0:
scale = config.get_inference_scale()
debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
# Run debug detection with lower confidence
debug_detections = detector.debug_detect_with_lower_confidence(debug_frame, debug_confidence=0.3)
# Analyze where these lower confidence detections are
debug_right_eye = [d for d in debug_detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width]
if len(debug_right_eye) > 0:
logger.warning(f"VR180 Debug: Found {len(debug_right_eye)} right eye detections with lower confidence!")
for i, det in enumerate(debug_right_eye):
logger.warning(f"VR180 Debug: Right eye detection {i+1}: conf={det['confidence']:.3f}, bbox={det['bbox']}")
logger.warning(f"VR180 Debug: Consider lowering yolo_confidence from {config.get_yolo_confidence()} to 0.3-0.4")
else:
logger.info(f"VR180 Debug: No right eye detections found even with confidence 0.3")
logger.info(f"VR180 Debug: This confirms person is not visible in right eye view")
logger.info(f"Pipeline Debug: Segment {segment_idx} - Generated {len(yolo_prompts)} SAM2 prompts from {len(detections)} YOLO detections")
# Save debug frame with detections visualized (if enabled)
if config.get('advanced.save_yolo_debug_frames', False):
debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug.jpg")
# Load first frame for debug visualization
cap = cv2.VideoCapture(segment_info['video_file'])
ret, debug_frame = cap.read()
cap.release()
if ret:
# Scale frame to match detection scale
if config.get_inference_scale() != 1.0:
scale = config.get_inference_scale()
debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
detector.save_debug_frame_with_detections(debug_frame, detections, debug_frame_path, yolo_prompts)
else:
logger.warning(f"Could not load frame for debug visualization in segment {segment_idx}")
# Check if we have YOLO masks for debug visualization
has_yolo_masks = False
if detections and detector.supports_segmentation:
has_yolo_masks = any(d.get('has_mask', False) for d in detections)
# Generate first frame masks debug (SAM2 or YOLO)
first_frame_debug_path = os.path.join(segment_info['directory'], "first_frame_detection.jpg")
if has_yolo_masks:
logger.info(f"Pipeline Debug: Generating YOLO first frame masks for segment {segment_idx}")
# Create YOLO mask debug visualization
create_yolo_mask_debug_frame(detections, segment_info['video_file'], first_frame_debug_path, config.get_inference_scale())
else:
logger.info(f"Pipeline Debug: Generating SAM2 first frame masks for segment {segment_idx}")
sam2_processor.generate_first_frame_debug_masks(
segment_info['video_file'],
yolo_prompts,
first_frame_debug_path,
config.get_inference_scale()
)
else:
logger.warning(f"No humans detected in segment {segment_idx}")
# Save debug frame even when no detections (if enabled)
if config.get('advanced.save_yolo_debug_frames', False):
debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug_no_detections.jpg")
# Load first frame for debug visualization
cap = cv2.VideoCapture(segment_info['video_file'])
ret, debug_frame = cap.read()
cap.release()
if ret:
# Scale frame to match detection scale
if config.get_inference_scale() != 1.0:
scale = config.get_inference_scale()
debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
# Add "No detections" text overlay
cv2.putText(debug_frame, "YOLO: No humans detected",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX, 1.0,
(0, 0, 255), 2) # Red text
cv2.imwrite(debug_frame_path, debug_frame)
logger.info(f"Saved no-detection debug frame to {debug_frame_path}")
else:
logger.warning(f"Could not load frame for no-detection debug visualization in segment {segment_idx}")
elif segment_idx > 0:
# Try to load previous segment mask
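                # Walk backwards until the most recent segment with saved masks is found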
for j in range(segment_idx - 1, -1, -1):
prev_segment_dir = segments_info[j]['directory']
previous_masks = sam2_processor.load_previous_segment_mask(prev_segment_dir)
if previous_masks:
logger.info(f"Using masks from segment {j} for segment {segment_idx}")
break
if not yolo_prompts and not previous_masks:
logger.error(f"No prompts or previous masks available for segment {segment_idx}")
continue
            # Recheck whether YOLO provided segmentation masks (e.g. when detections were
            # loaded from file) so they can seed SAM2 as initial masks below
            if 'has_yolo_masks' not in locals():
                has_yolo_masks = False
                if use_detections and detections and detector.supports_segmentation:
                    has_yolo_masks = any(d.get('has_mask', False) for d in detections)
if has_yolo_masks:
logger.info(f"Pipeline Debug: YOLO segmentation provided masks - using as SAM2 initial masks for segment {segment_idx}")
# Convert YOLO masks to initial masks for SAM2
cap = cv2.VideoCapture(segment_info['video_file'])
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
# Convert YOLO masks to the format expected by SAM2 add_previous_masks_to_predictor
yolo_masks_dict = {}
for i, detection in enumerate(detections[:2]): # Up to 2 objects
if detection.get('has_mask', False):
mask = detection['mask']
# Resize mask to match inference scale
if config.get_inference_scale() != 1.0:
scale = config.get_inference_scale()
scaled_height = int(frame_height * scale)
scaled_width = int(frame_width * scale)
mask = cv2.resize(mask.astype(np.float32), (scaled_width, scaled_height), interpolation=cv2.INTER_NEAREST)
mask = mask > 0.5
obj_id = i + 1 # Sequential object IDs
yolo_masks_dict[obj_id] = mask.astype(bool)
logger.info(f"Pipeline Debug: YOLO mask for Object {obj_id} - shape: {mask.shape}, pixels: {np.sum(mask)}")
logger.info(f"Pipeline Debug: Using YOLO masks as SAM2 initial masks - {len(yolo_masks_dict)} objects")
# Use traditional SAM2 pipeline with YOLO masks as initial masks
previous_masks = yolo_masks_dict
yolo_prompts = None # Don't use bounding box prompts when we have masks
# Debug what we're passing to SAM2
if yolo_prompts:
logger.info(f"Pipeline Debug: Passing {len(yolo_prompts)} YOLO prompts to SAM2 for segment {segment_idx}")
for i, prompt in enumerate(yolo_prompts):
logger.info(f"Pipeline Debug: Prompt {i+1}: Object {prompt['obj_id']}, bbox={prompt['bbox']}")
if previous_masks:
logger.info(f"Pipeline Debug: Using {len(previous_masks)} previous masks for segment {segment_idx}")
logger.info(f"Pipeline Debug: Previous mask object IDs: {list(previous_masks.keys())}")
# Handle mid-segment detection if enabled (works for both detection and segmentation modes)
multi_frame_prompts = None
if config.get('advanced.enable_mid_segment_detection', False) and (yolo_prompts or has_yolo_masks):
logger.info(f"Mid-segment Detection: Enabled for segment {segment_idx}")
# Calculate frame indices for re-detection
cap = cv2.VideoCapture(segment_info['video_file'])
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
cap.release()
redetection_interval = config.get('advanced.redetection_interval', 30)
max_redetections = config.get('advanced.max_redetections_per_segment', 10)
# Generate frame indices: [30, 60, 90, ...] (skip frame 0 since we already have first frame prompts)
frame_indices = []
frame_idx = redetection_interval
while frame_idx < total_frames and len(frame_indices) < max_redetections:
frame_indices.append(frame_idx)
frame_idx += redetection_interval
if frame_indices:
logger.info(f"Mid-segment Detection: Running YOLO on frames {frame_indices} (interval={redetection_interval})")
# Run multi-frame detection
multi_frame_detections = detector.detect_humans_multi_frame(
segment_info['video_file'],
frame_indices,
scale=config.get_inference_scale()
)
# Convert detections to SAM2 prompts (different handling for segmentation vs detection mode)
multi_frame_prompts = {}
cap = cv2.VideoCapture(segment_info['video_file'])
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
for frame_idx, detections in multi_frame_detections.items():
if detections:
if has_yolo_masks:
# Segmentation mode: convert YOLO masks to SAM2 mask prompts
frame_masks = {}
for i, detection in enumerate(detections[:2]): # Up to 2 objects
if detection.get('has_mask', False):
mask = detection['mask']
# Resize mask to match inference scale
if config.get_inference_scale() != 1.0:
scale = config.get_inference_scale()
scaled_height = int(frame_height * scale)
scaled_width = int(frame_width * scale)
mask = cv2.resize(mask.astype(np.float32), (scaled_width, scaled_height), interpolation=cv2.INTER_NEAREST)
mask = mask > 0.5
obj_id = i + 1 # Sequential object IDs
frame_masks[obj_id] = mask.astype(bool)
logger.debug(f"Mid-segment Detection: Frame {frame_idx}, Object {obj_id} mask - shape: {mask.shape}, pixels: {np.sum(mask)}")
if frame_masks:
# Store as mask prompts (different format than bbox prompts)
multi_frame_prompts[frame_idx] = {'masks': frame_masks}
logger.info(f"Mid-segment Detection: Frame {frame_idx} -> {len(frame_masks)} YOLO masks")
else:
# Detection mode: convert to bounding box prompts (existing logic)
prompts = detector.convert_detections_to_sam2_prompts(detections, frame_width)
multi_frame_prompts[frame_idx] = prompts
logger.info(f"Mid-segment Detection: Frame {frame_idx} -> {len(prompts)} SAM2 prompts")
logger.info(f"Mid-segment Detection: Generated prompts for {len(multi_frame_prompts)} frames")
else:
logger.info(f"Mid-segment Detection: No additional frames to process (segment has {total_frames} frames)")
elif config.get('advanced.enable_mid_segment_detection', False):
logger.info(f"Mid-segment Detection: Skipped for segment {segment_idx} (no initial YOLO data)")
# Process segment with SAM2
logger.info(f"Pipeline Debug: Starting SAM2 processing for segment {segment_idx}")
video_segments = sam2_processor.process_single_segment(
segment_info,
yolo_prompts=yolo_prompts,
previous_masks=previous_masks,
inference_scale=config.get_inference_scale(),
multi_frame_prompts=multi_frame_prompts
)
if video_segments is None:
logger.error(f"SAM2 processing failed for segment {segment_idx}")
continue
# Check if SAM2 produced adequate results
if len(video_segments) == 0:
logger.error(f"SAM2 produced no frames for segment {segment_idx}")
continue
elif len(video_segments) < 10: # Expected many frames for a 5-second segment
logger.warning(f"SAM2 produced very few frames ({len(video_segments)}) for segment {segment_idx} - this may indicate propagation failure")
# Debug what SAM2 produced
logger.info(f"Pipeline Debug: SAM2 completed for segment {segment_idx}")
logger.info(f"Pipeline Debug: Generated masks for {len(video_segments)} frames")
if video_segments:
# Check first frame to see what objects were tracked
first_frame_idx = min(video_segments.keys())
first_frame_objects = video_segments[first_frame_idx]
logger.info(f"Pipeline Debug: First frame contains {len(first_frame_objects)} tracked objects")
logger.info(f"Pipeline Debug: Tracked object IDs: {list(first_frame_objects.keys())}")
for obj_id, mask in first_frame_objects.items():
mask_pixels = np.sum(mask)
logger.info(f"Pipeline Debug: Object {obj_id} mask has {mask_pixels} pixels")
# Check last frame as well
last_frame_idx = max(video_segments.keys())
last_frame_objects = video_segments[last_frame_idx]
logger.info(f"Pipeline Debug: Last frame contains {len(last_frame_objects)} tracked objects")
logger.info(f"Pipeline Debug: Final object IDs: {list(last_frame_objects.keys())}")
# Save final masks for next segment
mask_path = os.path.join(segment_info['directory'], "mask.png")
sam2_processor.save_final_masks(
video_segments,
mask_path,
green_color=config.get_green_color(),
blue_color=config.get_blue_color()
)
# Apply green screen and save output video
success = mask_processor.process_segment(
segment_info,
video_segments,
use_nvenc=config.get_use_nvenc(),
bitrate=config.get_output_bitrate()
)
if success:
logger.info(f"Successfully processed segment {segment_idx}")
else:
logger.error(f"Failed to create green screen video for segment {segment_idx}")
# Log processing summary
logger.info(f"Sequential processing complete. Total humans detected: {total_humans_detected}")
        # Step 5: Assemble final video
        logger.info("Step 5: Assembling final video with audio")
# Initialize video assembler
assembler = VideoAssembler(
preserve_audio=config.get_preserve_audio(),
use_nvenc=config.get_use_nvenc()
)
# Verify all segments are complete
all_complete, missing = assembler.verify_segment_completeness(segments_dir)
if not all_complete:
logger.error(f"Cannot assemble video - missing segments: {missing}")
return 1
# Assemble final video
final_output = os.path.join(output_dir, config.get_output_filename())
success = assembler.assemble_final_video(
segments_dir,
input_video,
final_output,
bitrate=config.get_output_bitrate()
)
        if success:
            logger.info(f"Final video saved to: {final_output}")
            logger.info("Pipeline completed successfully")
            return 0
        else:
            logger.error("Final video assembly failed")
            return 1
except Exception as e:
logger.error(f"Pipeline failed: {e}", exc_info=True)
return 1
finally:
# Cleanup async preprocessor if it was used
if async_preprocessor:
async_preprocessor.cleanup()
logger.debug("Async preprocessor cleanup completed")
def main():
"""Main entry point - wrapper for async main."""
import asyncio
return asyncio.run(main_async())
if __name__ == "__main__":
exit_code = main()
sys.exit(exit_code)