#!/usr/bin/env python3
"""
Main entry point for YOLO + SAM2 video processing pipeline.
Processes long videos by splitting into segments, detecting humans with YOLO,
and creating green screen masks with SAM2.
"""

import os
import sys
import argparse
import cv2
import numpy as np
from typing import List

# Add project root to path
sys.path.append(os.path.dirname(__file__))

from core.config_loader import ConfigLoader
from core.video_splitter import VideoSplitter
from core.yolo_detector import YOLODetector
from core.sam2_processor import SAM2Processor
from core.mask_processor import MaskProcessor
from core.video_assembler import VideoAssembler
from utils.logging_utils import setup_logging, get_logger
from utils.file_utils import ensure_directory
from utils.status_utils import print_processing_status, cleanup_incomplete_segment

logger = get_logger(__name__)

def parse_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="YOLO + SAM2 Video Processing Pipeline"
    )
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="Path to YAML configuration file"
    )
    parser.add_argument(
        "--log-file",
        type=str,
        help="Optional log file path"
    )
    parser.add_argument(
        "--status",
        action="store_true",
        help="Show processing status and exit"
    )
    parser.add_argument(
        "--cleanup-segment",
        type=int,
        help="Clean up a specific segment for restart (segment index)"
    )
    return parser.parse_args()

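# Illustrative invocations based on the arguments defined above (the script name and
# config path are placeholders, not taken from this repository):
#   python main.py --config config.yaml
#   python main.py --config config.yaml --status
#   python main.py --config config.yaml --cleanup-segment 3
#   python main.py --config config.yaml --log-file pipeline.log
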
def validate_dependencies():
    """Validate that required dependencies are available."""
    try:
        import torch
        import cv2
        import numpy as np
        import cupy as cp
        from ultralytics import YOLO
        from sam2.build_sam import build_sam2_video_predictor
        logger.info("All dependencies validated successfully")
        return True
    except ImportError as e:
        logger.error(f"Missing dependency: {e}")
        logger.error("Please install requirements: pip install -r requirements.txt")
        return False

def create_yolo_mask_debug_frame(detections: List[dict], video_path: str, output_path: str, scale: float = 1.0) -> bool:
    """
    Create debug visualization for YOLO direct masks.

    Args:
        detections: List of YOLO detections with masks
        video_path: Path to video file
        output_path: Path to save debug image
        scale: Scale factor for frame processing

    Returns:
        True if debug frame was created successfully
    """
    try:
        # Load first frame
        cap = cv2.VideoCapture(video_path)
        ret, original_frame = cap.read()
        cap.release()

        if not ret:
            logger.error("Could not read first frame for YOLO mask debug")
            return False

        # Scale frame if needed
        if scale != 1.0:
            original_frame = cv2.resize(original_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)

        debug_frame = original_frame.copy()

        # Define colors for each object
        colors = {
            1: (0, 255, 0),    # Green for Object 1 (Left eye)
            2: (255, 0, 0),    # Blue for Object 2 (Right eye)
        }

        # Get detections with masks
        detections_with_masks = [d for d in detections if d.get('has_mask', False)]

        # Overlay masks with transparency
        obj_id = 1
        for detection in detections_with_masks[:2]:  # Up to 2 objects
            mask = detection['mask']

            # Resize mask to match frame if needed
            if mask.shape != original_frame.shape[:2]:
                mask = cv2.resize(mask.astype(np.float32), (original_frame.shape[1], original_frame.shape[0]), interpolation=cv2.INTER_NEAREST)
                mask = mask > 0.5

            mask = mask.astype(bool)

            # Apply colored overlay
            color = colors.get(obj_id, (128, 128, 128))
            overlay = debug_frame.copy()
            overlay[mask] = color

            # Blend with original (30% overlay, 70% original)
            cv2.addWeighted(overlay, 0.3, debug_frame, 0.7, 0, debug_frame)

            # Draw outline
            contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            cv2.drawContours(debug_frame, contours, -1, color, 2)

            logger.info(f"YOLO Mask Debug: Object {obj_id} mask - shape: {mask.shape}, pixels: {np.sum(mask)}")
            obj_id += 1

        # Add title and source info
        title = f"YOLO Direct Masks: {len(detections_with_masks)} objects detected"
        cv2.putText(debug_frame, title, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)

        source_info = "Mask Source: YOLO Segmentation (DIRECT - No SAM2)"
        cv2.putText(debug_frame, source_info, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)  # Green for YOLO

        # Add object legend
        y_offset = 90
        for i, detection in enumerate(detections_with_masks[:2]):
            obj_id = i + 1
            color = colors.get(obj_id, (128, 128, 128))
            text = f"Object {obj_id}: {'Left Eye' if obj_id == 1 else 'Right Eye'} (YOLO Mask)"
            cv2.putText(debug_frame, text, (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
            y_offset += 30

        # Save debug image
        success = cv2.imwrite(output_path, debug_frame)
        if success:
            logger.info(f"YOLO Mask Debug: Saved debug frame to {output_path}")
        else:
            logger.error(f"Failed to save YOLO mask debug frame to {output_path}")

        return success

    except Exception as e:
        logger.error(f"Error creating YOLO mask debug frame: {e}")
        return False

def resolve_detect_segments(detect_segments, total_segments: int) -> List[int]:
    """
    Resolve detect_segments configuration to list of segment indices.

    Args:
        detect_segments: Configuration value ("all", list, or None)
        total_segments: Total number of segments

    Returns:
        List of segment indices to process
    """
    if detect_segments == "all" or detect_segments is None:
        return list(range(total_segments))
    elif isinstance(detect_segments, list):
        # Filter out invalid segment indices
        valid_segments = [s for s in detect_segments if 0 <= s < total_segments]
        if len(valid_segments) != len(detect_segments):
            logger.warning(f"Some segment indices are invalid. Using: {valid_segments}")
        return valid_segments
    else:
        logger.warning(f"Invalid detect_segments format: {detect_segments}. Using all segments.")
        return list(range(total_segments))

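# Illustrative behaviour of resolve_detect_segments (values made up for the example):
#   resolve_detect_segments("all", 5)      -> [0, 1, 2, 3, 4]
#   resolve_detect_segments(None, 5)       -> [0, 1, 2, 3, 4]
#   resolve_detect_segments([0, 2, 7], 5)  -> [0, 2]   (out-of-range index 7 is dropped, with a warning)
#   resolve_detect_segments("0,2", 5)      -> [0, 1, 2, 3, 4]   (unrecognized format falls back to all segments)
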
def process_segment_with_separate_eyes(segment_info, detector, sam2_processor, mask_processor, config,
                                        previous_left_masks=None, previous_right_masks=None):
    """
    Process a single segment using separate eye processing mode.
    Split video first, then run YOLO independently on each eye.

    Args:
        segment_info: Segment information dictionary
        detector: YOLO detector instance
        sam2_processor: SAM2 processor with eye processing enabled
        mask_processor: Mask processor instance
        config: Configuration loader instance
        previous_left_masks: Previous masks for left eye
        previous_right_masks: Previous masks for right eye

    Returns:
        Tuple of (success, left_masks, right_masks)
    """
    segment_idx = segment_info['index']
    logger.info(f"VR180 Separate Eyes: Processing segment {segment_idx} (video-split approach)")

    # Get video properties
    cap = cv2.VideoCapture(segment_info['video_file'])
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()

    full_frame_shape = (frame_height, frame_width)

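    # Note: the separate-eye path assumes a side-by-side VR180 layout, i.e. the left half of
    # the frame is the left-eye view and the right half is the right-eye view (implied by the
    # frame_width // 2 prompt conversion and the combine_eye_masks(full_frame_shape) call below);
    # full_frame_shape is the (height, width) of the original, un-split frame.
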
    # Step 1: Split the segment video into left and right eye videos
    left_eye_video = os.path.join(segment_info['directory'], "left_eye.mp4")
    right_eye_video = os.path.join(segment_info['directory'], "right_eye.mp4")

    logger.info(f"VR180 Separate Eyes: Splitting segment video into eye videos")
    success = sam2_processor.eye_processor.split_video_into_eyes(
        segment_info['video_file'],
        left_eye_video,
        right_eye_video,
        scale=config.get_inference_scale()
    )

    if not success:
        logger.error(f"VR180 Separate Eyes: Failed to split video for segment {segment_idx}")
        return False, None, None

    # Check if both eye videos were created
    if not os.path.exists(left_eye_video) or not os.path.exists(right_eye_video):
        logger.error(f"VR180 Separate Eyes: Eye video files not created for segment {segment_idx}")
        return False, None, None

    logger.info(f"VR180 Separate Eyes: Created eye videos - left: {left_eye_video}, right: {right_eye_video}")

    # Step 2: Run YOLO independently on each eye video
    left_detections = detector.detect_humans_in_video_first_frame(
        left_eye_video, scale=1.0  # Already scaled during video splitting
    )

    right_detections = detector.detect_humans_in_video_first_frame(
        right_eye_video, scale=1.0  # Already scaled during video splitting
    )

    logger.info(f"VR180 Separate Eyes: YOLO detections - left: {len(left_detections)}, right: {len(right_detections)}")

    # Check if we have YOLO segmentation masks
    has_yolo_masks = False
    if detector.supports_segmentation:
        has_yolo_masks = any(d.get('has_mask', False) for d in (left_detections + right_detections))

    if has_yolo_masks:
        logger.info(f"VR180 Separate Eyes: YOLO segmentation mode - using direct masks instead of bounding boxes")

    # Save eye-specific debug frames if enabled
    if config.get('advanced.save_yolo_debug_frames', False) and (left_detections or right_detections):
        try:
            # Load first frames from each eye video
            left_cap = cv2.VideoCapture(left_eye_video)
            ret_left, left_frame = left_cap.read()
            left_cap.release()

            right_cap = cv2.VideoCapture(right_eye_video)
            ret_right, right_frame = right_cap.read()
            right_cap.release()

            if ret_left and ret_right:
                # Save eye-specific debug frames
                left_debug_path = os.path.join(segment_info['directory'], "left_eye_debug.jpg")
                right_debug_path = os.path.join(segment_info['directory'], "right_eye_debug.jpg")

                detector.save_eye_debug_frames(
                    left_frame, right_frame,
                    left_detections, right_detections,
                    left_debug_path, right_debug_path
                )

                logger.info(f"VR180 Separate Eyes: Saved eye-specific debug frames for segment {segment_idx}")
            else:
                logger.warning(f"VR180 Separate Eyes: Could not load eye frames for debug visualization")

        except Exception as e:
            logger.warning(f"VR180 Separate Eyes: Failed to create eye debug frames: {e}")

    # Step 3: Process left eye if detections exist or we have previous masks
    left_masks = None
    if left_detections or previous_left_masks:
        try:
            left_prompts = None
            left_initial_masks = None

            if left_detections:
                if has_yolo_masks:
                    # YOLO segmentation mode: convert masks to initial masks for SAM2
                    left_initial_masks = {}
                    for i, detection in enumerate(left_detections):
                        if detection.get('has_mask', False):
                            mask = detection['mask']
                            left_initial_masks[1] = mask.astype(bool)  # Always use obj_id=1 for single eye
                            logger.info(f"VR180 Separate Eyes: Left eye YOLO mask - shape: {mask.shape}, pixels: {np.sum(mask)}")
                            break  # Only take the first/best mask for single eye processing

                    if left_initial_masks:
                        logger.info(f"VR180 Separate Eyes: Left eye - using YOLO segmentation masks as initial masks")
                else:
                    # YOLO detection mode: convert bounding boxes to prompts
                    left_prompts = detector.convert_detections_to_sam2_prompts(left_detections, frame_width // 2)
                    logger.info(f"VR180 Separate Eyes: Left eye - {len(left_prompts)} SAM2 prompts")

            # Create temporary segment info for left eye processing
            left_segment_info = segment_info.copy()
            left_segment_info['video_file'] = left_eye_video

            left_masks = sam2_processor.process_single_eye_segment(
                left_segment_info, 'left', left_prompts,
                left_initial_masks or previous_left_masks,
                1.0  # Scale already applied during video splitting
            )

            if left_masks:
                logger.info(f"VR180 Separate Eyes: Left eye processed - {len(left_masks)} frame masks")
            else:
                logger.warning(f"VR180 Separate Eyes: Left eye processing failed")

        except Exception as e:
            logger.error(f"VR180 Separate Eyes: Error processing left eye for segment {segment_idx}: {e}")
            left_masks = None

    # Step 4: Process right eye if detections exist or we have previous masks
    right_masks = None
    if right_detections or previous_right_masks:
        try:
            right_prompts = None
            right_initial_masks = None

            if right_detections:
                if has_yolo_masks:
                    # YOLO segmentation mode: convert masks to initial masks for SAM2
                    right_initial_masks = {}
                    for i, detection in enumerate(right_detections):
                        if detection.get('has_mask', False):
                            mask = detection['mask']
                            right_initial_masks[1] = mask.astype(bool)  # Always use obj_id=1 for single eye
                            logger.info(f"VR180 Separate Eyes: Right eye YOLO mask - shape: {mask.shape}, pixels: {np.sum(mask)}")
                            break  # Only take the first/best mask for single eye processing

                    if right_initial_masks:
                        logger.info(f"VR180 Separate Eyes: Right eye - using YOLO segmentation masks as initial masks")
                else:
                    # YOLO detection mode: convert bounding boxes to prompts
                    right_prompts = detector.convert_detections_to_sam2_prompts(right_detections, frame_width // 2)
                    logger.info(f"VR180 Separate Eyes: Right eye - {len(right_prompts)} SAM2 prompts")

            # Create temporary segment info for right eye processing
            right_segment_info = segment_info.copy()
            right_segment_info['video_file'] = right_eye_video

            right_masks = sam2_processor.process_single_eye_segment(
                right_segment_info, 'right', right_prompts,
                right_initial_masks or previous_right_masks,
                1.0  # Scale already applied during video splitting
            )

            if right_masks:
                logger.info(f"VR180 Separate Eyes: Right eye processed - {len(right_masks)} frame masks")
            else:
                logger.warning(f"VR180 Separate Eyes: Right eye processing failed")

        except Exception as e:
            logger.error(f"VR180 Separate Eyes: Error processing right eye for segment {segment_idx}: {e}")
            right_masks = None

    # Step 5: Check if we got any valid masks
    if not left_masks and not right_masks:
        logger.warning(f"VR180 Separate Eyes: Neither eye produced valid masks for segment {segment_idx}")

        if config.get('processing.enable_greenscreen_fallback', True):
            logger.info(f"VR180 Separate Eyes: Using greenscreen fallback for segment {segment_idx}")
            success = mask_processor.process_greenscreen_only_segment(
                segment_info,
                green_color=config.get_green_color(),
                use_nvenc=config.get_use_nvenc(),
                bitrate=config.get_output_bitrate()
            )
            return success, None, None
        else:
            logger.error(f"VR180 Separate Eyes: No masks generated and greenscreen fallback disabled")
            return False, None, None

    # Step 6: Combine masks back to full frame format
    try:
        logger.info(f"VR180 Separate Eyes: Combining eye masks for segment {segment_idx}")
        combined_masks = sam2_processor.eye_processor.combine_eye_masks(
            left_masks, right_masks, full_frame_shape
        )

        if not combined_masks:
            logger.error(f"VR180 Separate Eyes: Failed to combine eye masks for segment {segment_idx}")
            return False, left_masks, right_masks

        # Validate combined masks have reasonable content
        total_mask_pixels = 0
        for frame_idx, frame_masks in combined_masks.items():
            for obj_id, mask in frame_masks.items():
                if mask is not None:
                    total_mask_pixels += np.sum(mask)

        if total_mask_pixels == 0:
            logger.warning(f"VR180 Separate Eyes: Combined masks are empty for segment {segment_idx}")
            if config.get('processing.enable_greenscreen_fallback', True):
                logger.info(f"VR180 Separate Eyes: Using greenscreen fallback due to empty masks")
                success = mask_processor.process_greenscreen_only_segment(
                    segment_info,
                    green_color=config.get_green_color(),
                    use_nvenc=config.get_use_nvenc(),
                    bitrate=config.get_output_bitrate()
                )
                return success, left_masks, right_masks

        logger.info(f"VR180 Separate Eyes: Combined masks contain {total_mask_pixels} total pixels")

    except Exception as e:
        logger.error(f"VR180 Separate Eyes: Error combining eye masks for segment {segment_idx}: {e}")
        # Try greenscreen fallback if mask combination fails
        if config.get('processing.enable_greenscreen_fallback', True):
            logger.info(f"VR180 Separate Eyes: Using greenscreen fallback due to mask combination error")
            success = mask_processor.process_greenscreen_only_segment(
                segment_info,
                green_color=config.get_green_color(),
                use_nvenc=config.get_use_nvenc(),
                bitrate=config.get_output_bitrate()
            )
            return success, left_masks, right_masks
        else:
            return False, left_masks, right_masks

    # Step 7: Save combined masks
    mask_path = os.path.join(segment_info['directory'], "mask.png")
    sam2_processor.save_final_masks(
        combined_masks,
        mask_path,
        green_color=config.get_green_color(),
        blue_color=config.get_blue_color()
    )

    # Step 8: Apply green screen and save output video
    success = mask_processor.process_segment(
        segment_info,
        combined_masks,
        use_nvenc=config.get_use_nvenc(),
        bitrate=config.get_output_bitrate()
    )

    if success:
        logger.info(f"VR180 Separate Eyes: Successfully processed segment {segment_idx}")
    else:
        logger.error(f"VR180 Separate Eyes: Failed to create output video for segment {segment_idx}")

    # Clean up temporary eye video files
    try:
        if os.path.exists(left_eye_video):
            os.remove(left_eye_video)
        if os.path.exists(right_eye_video):
            os.remove(right_eye_video)
        logger.debug(f"VR180 Separate Eyes: Cleaned up temporary eye videos for segment {segment_idx}")
    except Exception as e:
        logger.warning(f"VR180 Separate Eyes: Failed to clean up temporary eye videos: {e}")

    return success, left_masks, right_masks

async def main_async():
    """Main processing pipeline with async optimizations."""
    args = parse_arguments()

    # Defined up front so the finally block can reference it even if setup fails early
    async_preprocessor = None

    try:
        # Load configuration
        config = ConfigLoader(args.config)

        # Setup logging
        setup_logging(config.get_log_level(), args.log_file)

        # Handle status check
        if args.status:
            output_dir = config.get_output_directory()
            input_video = config.get_input_video_path()
            video_name = os.path.splitext(os.path.basename(input_video))[0]
            segments_dir = os.path.join(output_dir, f"{video_name}_segments")
            print_processing_status(segments_dir)
            return 0

        # Handle segment cleanup
        if args.cleanup_segment is not None:
            output_dir = config.get_output_directory()
            input_video = config.get_input_video_path()
            video_name = os.path.splitext(os.path.basename(input_video))[0]
            segments_dir = os.path.join(output_dir, f"{video_name}_segments")
            segment_dir = os.path.join(segments_dir, f"segment_{args.cleanup_segment}")

            if cleanup_incomplete_segment(segment_dir):
                logger.info(f"Successfully cleaned up segment {args.cleanup_segment}")
                return 0
            else:
                logger.error(f"Failed to clean up segment {args.cleanup_segment}")
                return 1

        logger.info("Starting YOLO + SAM2 video processing pipeline")

        # Validate dependencies
        if not validate_dependencies():
            return 1

        # Validate input video exists
        input_video = config.get_input_video_path()
        if not os.path.exists(input_video):
            logger.error(f"Input video not found: {input_video}")
            return 1

        # Setup output directory
        output_dir = config.get_output_directory()
        ensure_directory(output_dir)

        # Step 1: Split video into segments
        logger.info("Step 1: Splitting video into segments")
        splitter = VideoSplitter(
            segment_duration=config.get_segment_duration(),
            force_keyframes=config.get('video.force_keyframes', True)
        )

        segments_dir, segment_dirs = splitter.split_video(input_video, output_dir)
        logger.info(f"Created {len(segment_dirs)} segments in {segments_dir}")

        # Get detailed segment information
        segments_info = splitter.get_segment_info(segments_dir)

        # Resolve which segments to process with YOLO
        detect_segments_config = config.get_detect_segments()
        detect_segments = resolve_detect_segments(detect_segments_config, len(segments_info))

        # Initialize processors once
        logger.info("Step 2: Initializing YOLO detector")

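        # Note: 'models.yolo_mode' defaults to 'detection' below. When a segmentation-capable
        # model is configured and detections carry masks (detector.supports_segmentation and
        # 'has_mask'), later steps seed SAM2 with those masks instead of bounding-box prompts.
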
        # Get YOLO mode and model paths
        yolo_mode = config.get('models.yolo_mode', 'detection')
        detection_model = config.get('models.yolo_detection_model', config.get_yolo_model_path())
        segmentation_model = config.get('models.yolo_segmentation_model', None)

        logger.info(f"YOLO Mode: {yolo_mode}")

        detector = YOLODetector(
            detection_model_path=detection_model,
            segmentation_model_path=segmentation_model,
            mode=yolo_mode,
            confidence_threshold=config.get_yolo_confidence(),
            human_class_id=config.get_human_class_id()
        )

        logger.info("Step 3: Initializing SAM2 processor")

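        # Config keys read in the setup below, with the fallback defaults passed to config.get():
        #   processing.separate_eye_processing            (False)
        #   processing.eye_overlap_pixels                 (0)
        #   processing.enable_greenscreen_fallback        (True)
        #   advanced.enable_background_lowres_generation  (False)
        #   advanced.max_concurrent_lowres (3), advanced.lowres_segments_ahead (3), advanced.use_ffmpeg_lowres (True)
        #   models.sam2_vos_optimized                     (False)
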
        # Check if separate eye processing is enabled
        separate_eye_processing = config.get('processing.separate_eye_processing', False)
        eye_overlap_pixels = config.get('processing.eye_overlap_pixels', 0)
        enable_greenscreen_fallback = config.get('processing.enable_greenscreen_fallback', True)

        # Initialize async preprocessor if enabled
        async_preprocessor = None
        if config.get('advanced.enable_background_lowres_generation', False):
            from core.async_lowres_preprocessor import AsyncLowResPreprocessor

            max_concurrent = config.get('advanced.max_concurrent_lowres', 3)
            segments_ahead = config.get('advanced.lowres_segments_ahead', 3)
            use_ffmpeg = config.get('advanced.use_ffmpeg_lowres', True)

            async_preprocessor = AsyncLowResPreprocessor(
                max_concurrent=max_concurrent,
                segments_ahead=segments_ahead,
                use_ffmpeg=use_ffmpeg
            )
            logger.info(f"Async low-res preprocessing: ENABLED (max_concurrent={max_concurrent}, segments_ahead={segments_ahead})")
        else:
            logger.info("Async low-res preprocessing: DISABLED")

        if separate_eye_processing:
            logger.info("VR180 Separate Eye Processing: ENABLED")
            logger.info(f"Eye overlap pixels: {eye_overlap_pixels}")
            logger.info(f"Greenscreen fallback: {enable_greenscreen_fallback}")

        sam2_processor = SAM2Processor(
            checkpoint_path=config.get_sam2_checkpoint(),
            config_path=config.get_sam2_config(),
            vos_optimized=config.get('models.sam2_vos_optimized', False),
            separate_eye_processing=separate_eye_processing,
            eye_overlap_pixels=eye_overlap_pixels,
            async_preprocessor=async_preprocessor
        )

        # Initialize mask processor with quality enhancements
        mask_quality_config = config.get('mask_processing', {})
        mask_processor = MaskProcessor(
            green_color=config.get_green_color(),
            blue_color=config.get_blue_color(),
            mask_quality_config=mask_quality_config
        )

        # Process each segment sequentially (YOLO -> SAM2 -> Render)
        logger.info("Step 4: Processing segments sequentially")
        total_humans_detected = 0

        # Start background low-res video preprocessing if enabled
        if async_preprocessor:
            logger.info("Starting background low-res video preprocessing")
            async_preprocessor.start_background_preparation(
                segments_info,
                config.get_inference_scale(),
                separate_eye_processing,
                current_segment=0
            )

        # Initialize previous masks for separate eye processing
        previous_left_masks = None
        previous_right_masks = None

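        # Per-segment flow (summary of the loop below): skip segments that already have an
        # output video; in separate-eye mode delegate to process_segment_with_separate_eyes();
        # otherwise seed SAM2 from YOLO detections (or a previous segment's mask), propagate
        # masks with SAM2, then render the green screen output for the segment.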
        for i, segment_info in enumerate(segments_info):
            segment_idx = segment_info['index']

            logger.info(f"Processing segment {segment_idx}/{len(segments_info)-1}")

            # Start background preparation for upcoming segments
            if async_preprocessor and i < len(segments_info) - 1:
                async_preprocessor.start_background_preparation(
                    segments_info,
                    config.get_inference_scale(),
                    separate_eye_processing,
                    current_segment=i
                )

            # Reset temporal history for new segment
            mask_processor.reset_temporal_history()

            # Skip if segment output already exists
            output_video = os.path.join(segment_info['directory'], f"output_{segment_idx}.mp4")
            if os.path.exists(output_video):
                logger.info(f"Segment {segment_idx} already processed, skipping")
                continue

            # Branch based on processing mode
            if separate_eye_processing:
                # Use separate eye processing mode
                success, left_masks, right_masks = process_segment_with_separate_eyes(
                    segment_info, detector, sam2_processor, mask_processor, config,
                    previous_left_masks, previous_right_masks
                )

                # Update previous masks for next segment
                previous_left_masks = left_masks
                previous_right_masks = right_masks

                if success:
                    logger.info(f"Successfully processed segment {segment_idx} with separate eye processing")
                else:
                    logger.error(f"Failed to process segment {segment_idx} with separate eye processing")

                continue  # Skip the original processing logic

            # Determine if we should use YOLO detections or previous masks
            use_detections = segment_idx in detect_segments

            # First segment must use detections
            if segment_idx == 0 and not use_detections:
                logger.warning(f"First segment must use YOLO detection")
                use_detections = True

            # Get YOLO prompts or previous masks
            yolo_prompts = None
            previous_masks = None

            if use_detections:
                # Run YOLO detection on current segment
                logger.info(f"Running YOLO detection on segment {segment_idx}")
                detection_file = os.path.join(segment_info['directory'], "yolo_detections")

                # Check if detection already exists
                if os.path.exists(detection_file):
                    logger.info(f"Loading existing YOLO detections for segment {segment_idx}")
                    detections = detector.load_detections_from_file(detection_file)
                else:
                    # Run YOLO detection on first frame
                    detections = detector.detect_humans_in_video_first_frame(
                        segment_info['video_file'],
                        scale=config.get_inference_scale()
                    )
                    # Save detections for future runs
                    detector.save_detections_to_file(detections, detection_file)

                if detections:
                    total_humans_detected += len(detections)
                    logger.info(f"Found {len(detections)} humans in segment {segment_idx}")

                    # Get frame width from video
                    cap = cv2.VideoCapture(segment_info['video_file'])
                    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                    cap.release()

                    yolo_prompts = detector.convert_detections_to_sam2_prompts(
                        detections, frame_width
                    )

                    # If no right eye detections found, run debug analysis with lower confidence
                    half_frame_width = frame_width // 2
                    right_eye_detections = [d for d in detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width]

                    if len(right_eye_detections) == 0 and config.get('advanced.save_yolo_debug_frames', False):
                        logger.info(f"VR180 Debug: No right eye detections found, running lower confidence analysis...")

                        # Load first frame for debug analysis
                        cap = cv2.VideoCapture(segment_info['video_file'])
                        ret, debug_frame = cap.read()
                        cap.release()

                        if ret:
                            # Scale frame to match detection scale
                            if config.get_inference_scale() != 1.0:
                                scale = config.get_inference_scale()
                                debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)

                            # Run debug detection with lower confidence
                            debug_detections = detector.debug_detect_with_lower_confidence(debug_frame, debug_confidence=0.3)

                            # Analyze where these lower confidence detections are
                            debug_right_eye = [d for d in debug_detections if (d['bbox'][0] + d['bbox'][2]) / 2 >= half_frame_width]

                            if len(debug_right_eye) > 0:
                                logger.warning(f"VR180 Debug: Found {len(debug_right_eye)} right eye detections with lower confidence!")
                                for i, det in enumerate(debug_right_eye):
                                    logger.warning(f"VR180 Debug: Right eye detection {i+1}: conf={det['confidence']:.3f}, bbox={det['bbox']}")
                                logger.warning(f"VR180 Debug: Consider lowering yolo_confidence from {config.get_yolo_confidence()} to 0.3-0.4")
                            else:
                                logger.info(f"VR180 Debug: No right eye detections found even with confidence 0.3")
                                logger.info(f"VR180 Debug: This confirms person is not visible in right eye view")

                    logger.info(f"Pipeline Debug: Segment {segment_idx} - Generated {len(yolo_prompts)} SAM2 prompts from {len(detections)} YOLO detections")

                    # Save debug frame with detections visualized (if enabled)
                    if config.get('advanced.save_yolo_debug_frames', False):
                        debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug.jpg")

                        # Load first frame for debug visualization
                        cap = cv2.VideoCapture(segment_info['video_file'])
                        ret, debug_frame = cap.read()
                        cap.release()

                        if ret:
                            # Scale frame to match detection scale
                            if config.get_inference_scale() != 1.0:
                                scale = config.get_inference_scale()
                                debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)

                            detector.save_debug_frame_with_detections(debug_frame, detections, debug_frame_path, yolo_prompts)
                        else:
                            logger.warning(f"Could not load frame for debug visualization in segment {segment_idx}")

                        # Check if we have YOLO masks for debug visualization
                        has_yolo_masks = False
                        if detections and detector.supports_segmentation:
                            has_yolo_masks = any(d.get('has_mask', False) for d in detections)

                        # Generate first frame masks debug (SAM2 or YOLO)
                        first_frame_debug_path = os.path.join(segment_info['directory'], "first_frame_detection.jpg")

                        if has_yolo_masks:
                            logger.info(f"Pipeline Debug: Generating YOLO first frame masks for segment {segment_idx}")
                            # Create YOLO mask debug visualization
                            create_yolo_mask_debug_frame(detections, segment_info['video_file'], first_frame_debug_path, config.get_inference_scale())
                        else:
                            logger.info(f"Pipeline Debug: Generating SAM2 first frame masks for segment {segment_idx}")
                            sam2_processor.generate_first_frame_debug_masks(
                                segment_info['video_file'],
                                yolo_prompts,
                                first_frame_debug_path,
                                config.get_inference_scale()
                            )

                else:
                    logger.warning(f"No humans detected in segment {segment_idx}")

                    # Save debug frame even when no detections (if enabled)
                    if config.get('advanced.save_yolo_debug_frames', False):
                        debug_frame_path = os.path.join(segment_info['directory'], "yolo_debug_no_detections.jpg")

                        # Load first frame for debug visualization
                        cap = cv2.VideoCapture(segment_info['video_file'])
                        ret, debug_frame = cap.read()
                        cap.release()

                        if ret:
                            # Scale frame to match detection scale
                            if config.get_inference_scale() != 1.0:
                                scale = config.get_inference_scale()
                                debug_frame = cv2.resize(debug_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)

                            # Add "No detections" text overlay
                            cv2.putText(debug_frame, "YOLO: No humans detected",
                                        (10, 30),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                                        (0, 0, 255), 2)  # Red text

                            cv2.imwrite(debug_frame_path, debug_frame)
                            logger.info(f"Saved no-detection debug frame to {debug_frame_path}")
                        else:
                            logger.warning(f"Could not load frame for no-detection debug visualization in segment {segment_idx}")

            elif segment_idx > 0:
                # Try to load previous segment mask
                for j in range(segment_idx - 1, -1, -1):
                    prev_segment_dir = segments_info[j]['directory']
                    previous_masks = sam2_processor.load_previous_segment_mask(prev_segment_dir)
                    if previous_masks:
                        logger.info(f"Using masks from segment {j} for segment {segment_idx}")
                        break

            if not yolo_prompts and not previous_masks:
                logger.error(f"No prompts or previous masks available for segment {segment_idx}")
                continue

            # Check if we have YOLO masks to seed SAM2 (recheck in case detections were loaded from file)
            if 'has_yolo_masks' not in locals():
                has_yolo_masks = False
                if 'detections' in locals() and detections and detector.supports_segmentation:
                    has_yolo_masks = any(d.get('has_mask', False) for d in detections)

            if has_yolo_masks:
                logger.info(f"Pipeline Debug: YOLO segmentation provided masks - using as SAM2 initial masks for segment {segment_idx}")

                # Convert YOLO masks to initial masks for SAM2
                cap = cv2.VideoCapture(segment_info['video_file'])
                frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                cap.release()

                # Convert YOLO masks to the format expected by SAM2 add_previous_masks_to_predictor
                yolo_masks_dict = {}
                for i, detection in enumerate(detections[:2]):  # Up to 2 objects
                    if detection.get('has_mask', False):
                        mask = detection['mask']
                        # Resize mask to match inference scale
                        if config.get_inference_scale() != 1.0:
                            scale = config.get_inference_scale()
                            scaled_height = int(frame_height * scale)
                            scaled_width = int(frame_width * scale)
                            mask = cv2.resize(mask.astype(np.float32), (scaled_width, scaled_height), interpolation=cv2.INTER_NEAREST)
                            mask = mask > 0.5

                        obj_id = i + 1  # Sequential object IDs
                        yolo_masks_dict[obj_id] = mask.astype(bool)
                        logger.info(f"Pipeline Debug: YOLO mask for Object {obj_id} - shape: {mask.shape}, pixels: {np.sum(mask)}")

                logger.info(f"Pipeline Debug: Using YOLO masks as SAM2 initial masks - {len(yolo_masks_dict)} objects")

                # Use traditional SAM2 pipeline with YOLO masks as initial masks
                previous_masks = yolo_masks_dict
                yolo_prompts = None  # Don't use bounding box prompts when we have masks

            # Debug what we're passing to SAM2
            if yolo_prompts:
                logger.info(f"Pipeline Debug: Passing {len(yolo_prompts)} YOLO prompts to SAM2 for segment {segment_idx}")
                for i, prompt in enumerate(yolo_prompts):
                    logger.info(f"Pipeline Debug: Prompt {i+1}: Object {prompt['obj_id']}, bbox={prompt['bbox']}")

            if previous_masks:
                logger.info(f"Pipeline Debug: Using {len(previous_masks)} previous masks for segment {segment_idx}")
                logger.info(f"Pipeline Debug: Previous mask object IDs: {list(previous_masks.keys())}")

            # Handle mid-segment detection if enabled (works for both detection and segmentation modes)
            multi_frame_prompts = None
            if config.get('advanced.enable_mid_segment_detection', False) and (yolo_prompts or has_yolo_masks):
                logger.info(f"Mid-segment Detection: Enabled for segment {segment_idx}")

                # Calculate frame indices for re-detection
                cap = cv2.VideoCapture(segment_info['video_file'])
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
                cap.release()

                redetection_interval = config.get('advanced.redetection_interval', 30)
                max_redetections = config.get('advanced.max_redetections_per_segment', 10)

                # Generate frame indices: [30, 60, 90, ...] (skip frame 0 since we already have first frame prompts)
                frame_indices = []
                frame_idx = redetection_interval
                while frame_idx < total_frames and len(frame_indices) < max_redetections:
                    frame_indices.append(frame_idx)
                    frame_idx += redetection_interval

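                # Illustrative example (numbers assumed, not read from config): with
                # redetection_interval=30, max_redetections=10 and a 150-frame segment,
                # the loop above yields frame_indices == [30, 60, 90, 120].
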
                if frame_indices:
                    logger.info(f"Mid-segment Detection: Running YOLO on frames {frame_indices} (interval={redetection_interval})")

                    # Run multi-frame detection
                    multi_frame_detections = detector.detect_humans_multi_frame(
                        segment_info['video_file'],
                        frame_indices,
                        scale=config.get_inference_scale()
                    )

                    # Convert detections to SAM2 prompts (different handling for segmentation vs detection mode)
                    multi_frame_prompts = {}
                    cap = cv2.VideoCapture(segment_info['video_file'])
                    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                    cap.release()

                    for frame_idx, detections in multi_frame_detections.items():
                        if detections:
                            if has_yolo_masks:
                                # Segmentation mode: convert YOLO masks to SAM2 mask prompts
                                frame_masks = {}
                                for i, detection in enumerate(detections[:2]):  # Up to 2 objects
                                    if detection.get('has_mask', False):
                                        mask = detection['mask']
                                        # Resize mask to match inference scale
                                        if config.get_inference_scale() != 1.0:
                                            scale = config.get_inference_scale()
                                            scaled_height = int(frame_height * scale)
                                            scaled_width = int(frame_width * scale)
                                            mask = cv2.resize(mask.astype(np.float32), (scaled_width, scaled_height), interpolation=cv2.INTER_NEAREST)
                                            mask = mask > 0.5

                                        obj_id = i + 1  # Sequential object IDs
                                        frame_masks[obj_id] = mask.astype(bool)
                                        logger.debug(f"Mid-segment Detection: Frame {frame_idx}, Object {obj_id} mask - shape: {mask.shape}, pixels: {np.sum(mask)}")

                                if frame_masks:
                                    # Store as mask prompts (different format than bbox prompts)
                                    multi_frame_prompts[frame_idx] = {'masks': frame_masks}
                                    logger.info(f"Mid-segment Detection: Frame {frame_idx} -> {len(frame_masks)} YOLO masks")
                            else:
                                # Detection mode: convert to bounding box prompts (existing logic)
                                prompts = detector.convert_detections_to_sam2_prompts(detections, frame_width)
                                multi_frame_prompts[frame_idx] = prompts
                                logger.info(f"Mid-segment Detection: Frame {frame_idx} -> {len(prompts)} SAM2 prompts")

                    logger.info(f"Mid-segment Detection: Generated prompts for {len(multi_frame_prompts)} frames")
                else:
                    logger.info(f"Mid-segment Detection: No additional frames to process (segment has {total_frames} frames)")
            elif config.get('advanced.enable_mid_segment_detection', False):
                logger.info(f"Mid-segment Detection: Skipped for segment {segment_idx} (no initial YOLO data)")

            # Process segment with SAM2
            logger.info(f"Pipeline Debug: Starting SAM2 processing for segment {segment_idx}")
            video_segments = sam2_processor.process_single_segment(
                segment_info,
                yolo_prompts=yolo_prompts,
                previous_masks=previous_masks,
                inference_scale=config.get_inference_scale(),
                multi_frame_prompts=multi_frame_prompts
            )

            if video_segments is None:
                logger.error(f"SAM2 processing failed for segment {segment_idx}")
                continue

            # Check if SAM2 produced adequate results
            if len(video_segments) == 0:
                logger.error(f"SAM2 produced no frames for segment {segment_idx}")
                continue
            elif len(video_segments) < 10:  # Expected many frames for a 5-second segment
                logger.warning(f"SAM2 produced very few frames ({len(video_segments)}) for segment {segment_idx} - this may indicate propagation failure")

            # Debug what SAM2 produced
            logger.info(f"Pipeline Debug: SAM2 completed for segment {segment_idx}")
            logger.info(f"Pipeline Debug: Generated masks for {len(video_segments)} frames")

            if video_segments:
                # Check first frame to see what objects were tracked
                first_frame_idx = min(video_segments.keys())
                first_frame_objects = video_segments[first_frame_idx]
                logger.info(f"Pipeline Debug: First frame contains {len(first_frame_objects)} tracked objects")
                logger.info(f"Pipeline Debug: Tracked object IDs: {list(first_frame_objects.keys())}")

                for obj_id, mask in first_frame_objects.items():
                    mask_pixels = np.sum(mask)
                    logger.info(f"Pipeline Debug: Object {obj_id} mask has {mask_pixels} pixels")

                # Check last frame as well
                last_frame_idx = max(video_segments.keys())
                last_frame_objects = video_segments[last_frame_idx]
                logger.info(f"Pipeline Debug: Last frame contains {len(last_frame_objects)} tracked objects")
                logger.info(f"Pipeline Debug: Final object IDs: {list(last_frame_objects.keys())}")

            # Save final masks for next segment
            mask_path = os.path.join(segment_info['directory'], "mask.png")
            sam2_processor.save_final_masks(
                video_segments,
                mask_path,
                green_color=config.get_green_color(),
                blue_color=config.get_blue_color()
            )

            # Apply green screen and save output video
            success = mask_processor.process_segment(
                segment_info,
                video_segments,
                use_nvenc=config.get_use_nvenc(),
                bitrate=config.get_output_bitrate()
            )

            if success:
                logger.info(f"Successfully processed segment {segment_idx}")
            else:
                logger.error(f"Failed to create green screen video for segment {segment_idx}")

        # Log processing summary
        logger.info(f"Sequential processing complete. Total humans detected: {total_humans_detected}")

        # Step 5: Assemble final video
        logger.info("Step 5: Assembling final video with audio")

        # Initialize video assembler
        assembler = VideoAssembler(
            preserve_audio=config.get_preserve_audio(),
            use_nvenc=config.get_use_nvenc()
        )

        # Verify all segments are complete
        all_complete, missing = assembler.verify_segment_completeness(segments_dir)

        if not all_complete:
            logger.error(f"Cannot assemble video - missing segments: {missing}")
            return 1

        # Assemble final video
        final_output = os.path.join(output_dir, config.get_output_filename())

        success = assembler.assemble_final_video(
            segments_dir,
            input_video,
            final_output,
            bitrate=config.get_output_bitrate()
        )

        if success:
            logger.info(f"Final video saved to: {final_output}")
        else:
            logger.error("Final video assembly failed")
            return 1

        logger.info("Pipeline completed successfully")
        return 0

    except Exception as e:
        logger.error(f"Pipeline failed: {e}", exc_info=True)
        return 1
    finally:
        # Cleanup async preprocessor if it was used
        if async_preprocessor:
            async_preprocessor.cleanup()
            logger.debug("Async preprocessor cleanup completed")

def main():
    """Main entry point - wrapper for async main."""
    import asyncio
    return asyncio.run(main_async())

if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)