"""
|
|
Eye processor module for VR180 separate eye processing.
|
|
Handles splitting VR180 side-by-side frames into separate left/right eyes and recombining.
|
|
"""
|
|
|
|
import os
|
|
import cv2
|
|
import numpy as np
|
|
import logging
|
|
import subprocess
|
|
from typing import Dict, List, Any, Optional, Tuple
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|


class EyeProcessor:
    """Handles VR180 eye-specific processing operations."""

    def __init__(self, eye_overlap_pixels: int = 0):
        """
        Initialize the eye processor.

        Args:
            eye_overlap_pixels: Number of pixels to overlap between eyes for blending
        """
        self.eye_overlap_pixels = eye_overlap_pixels

    def split_frame_into_eyes(self, frame: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Split a VR180 side-by-side frame into separate left and right eye frames.

        When ``eye_overlap_pixels`` is non-zero, both crops extend past the
        center seam, so the two outputs share a band of
        ``2 * eye_overlap_pixels`` columns for later blending.

        Args:
            frame: Input VR180 frame (BGR format)

        Returns:
            Tuple of (left_eye_frame, right_eye_frame)
        """
        if frame.ndim != 3 or frame.shape[2] != 3:
            raise ValueError("Frame must be a 3-channel BGR image")

        height, width, _ = frame.shape
        half_width = width // 2

        # Extract the left and right eye frames, each extended by the overlap
        left_eye = frame[:, :half_width + self.eye_overlap_pixels, :]
        right_eye = frame[:, half_width - self.eye_overlap_pixels:, :]

        logger.debug(f"Split frame {width}x{height} into left: {left_eye.shape} and right: {right_eye.shape}")

        return left_eye, right_eye
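
    # Usage sketch for split_frame_into_eyes, using a synthetic frame so the
    # snippet stays self-contained (any BGR frame loaded via cv2 works the
    # same way):
    #
    #   processor = EyeProcessor(eye_overlap_pixels=16)
    #   frame = np.zeros((2048, 4096, 3), dtype=np.uint8)  # side-by-side VR180
    #   left_eye, right_eye = processor.split_frame_into_eyes(frame)
    #   assert left_eye.shape == (2048, 2048 + 16, 3)
    #   assert right_eye.shape == (2048, 2048 + 16, 3)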

    def split_video_into_eyes(self, input_video_path: str, left_output_path: str,
                              right_output_path: str, scale: float = 1.0) -> bool:
        """
        Split a VR180 video into separate left and right eye videos using FFmpeg.

        Args:
            input_video_path: Path to input VR180 video
            left_output_path: Output path for left eye video
            right_output_path: Output path for right eye video
            scale: Scale factor for output videos (default: 1.0)

        Returns:
            True if successful, False otherwise
        """
        try:
            # Probe the input dimensions with OpenCV
            cap = cv2.VideoCapture(input_video_path)
            if not cap.isOpened():
                logger.error(f"Could not open video: {input_video_path}")
                return False

            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            cap.release()

            # Each eye crop is half the frame plus the overlap band; scale both
            # output dimensions by the same factor to preserve aspect ratio
            crop_width = width // 2 + self.eye_overlap_pixels
            output_width = int(crop_width * scale)
            output_height = int(height * scale)

            # Create output directories if they don't exist (dirname may be
            # empty when a path has no directory component)
            for output_path in (left_output_path, right_output_path):
                os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

            # FFmpeg command for left eye (crop left half plus overlap)
            left_command = [
                'ffmpeg', '-y',
                '-i', input_video_path,
                '-vf', f'crop={crop_width}:{height}:0:0,scale={output_width}:{output_height}',
                '-c:v', 'libx264',
                '-preset', 'fast',
                '-crf', '18',
                left_output_path
            ]

            # FFmpeg command for right eye (crop right half plus overlap)
            right_command = [
                'ffmpeg', '-y',
                '-i', input_video_path,
                '-vf', f'crop={crop_width}:{height}:{width // 2 - self.eye_overlap_pixels}:0,scale={output_width}:{output_height}',
                '-c:v', 'libx264',
                '-preset', 'fast',
                '-crf', '18',
                right_output_path
            ]

            logger.info(f"Splitting video into left eye: {left_output_path}")
            result_left = subprocess.run(left_command, capture_output=True, text=True)
            if result_left.returncode != 0:
                logger.error(f"FFmpeg failed for left eye: {result_left.stderr}")
                return False

            logger.info(f"Splitting video into right eye: {right_output_path}")
            result_right = subprocess.run(right_command, capture_output=True, text=True)
            if result_right.returncode != 0:
                logger.error(f"FFmpeg failed for right eye: {result_right.stderr}")
                return False

            logger.info("Successfully split video into separate eye videos")
            return True

        except Exception as e:
            logger.error(f"Error splitting video into eyes: {e}")
            return False
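
    # Usage sketch for split_video_into_eyes (hedged: the file names below are
    # hypothetical, and ffmpeg must be available on PATH):
    #
    #   processor = EyeProcessor()
    #   ok = processor.split_video_into_eyes(
    #       "input_vr180.mp4",
    #       "work/left_eye.mp4",
    #       "work/right_eye.mp4",
    #       scale=0.5,  # halve resolution to speed up downstream inference
    #   )
    #   if not ok:
    #       raise RuntimeError("Eye split failed; see log for FFmpeg stderr")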

    def combine_eye_masks(self, left_masks: Optional[Dict[int, np.ndarray]],
                          right_masks: Optional[Dict[int, np.ndarray]],
                          full_frame_shape: Tuple[int, int]) -> Dict[int, Dict[int, np.ndarray]]:
        """
        Combine left and right eye masks back into full-frame format.

        Args:
            left_masks: Dictionary of masks from left eye processing (frame_idx -> mask)
            right_masks: Dictionary of masks from right eye processing (frame_idx -> mask)
            full_frame_shape: Shape of the full VR180 frame (height, width)

        Returns:
            Dictionary mapping frame_idx -> {object_id: combined_mask}; a
            single object ID of 1 is used for the merged mask
        """
        combined_masks = {}
        full_height, full_width = full_frame_shape
        half_width = full_width // 2

        # Gather all frame indices seen by either eye
        left_frames = set(left_masks.keys()) if left_masks else set()
        right_frames = set(right_masks.keys()) if right_masks else set()
        all_frames = left_frames.union(right_frames)

        for frame_idx in all_frames:
            # Start from an empty full-frame mask
            combined_mask = np.zeros((full_height, full_width), dtype=np.uint8)

            # Paste the left eye mask into the left half of the frame
            if left_masks and frame_idx in left_masks:
                left_mask = left_masks[frame_idx]
                if len(left_mask.shape) == 3:
                    left_mask = left_mask.squeeze()

                # Resize the left mask to the left crop size if needed
                left_target_width = half_width + self.eye_overlap_pixels
                if left_mask.shape != (full_height, left_target_width):
                    left_mask = cv2.resize(left_mask.astype(np.uint8),
                                           (left_target_width, full_height),
                                           interpolation=cv2.INTER_NEAREST)

                combined_mask[:, :left_target_width] = left_mask[:, :left_target_width]

            # Paste the right eye mask into the right half of the frame; in
            # the overlap band the right mask overwrites the left one
            if right_masks and frame_idx in right_masks:
                right_mask = right_masks[frame_idx]
                if len(right_mask.shape) == 3:
                    right_mask = right_mask.squeeze()

                # Resize the right mask to the right crop size if needed
                right_target_width = half_width + self.eye_overlap_pixels
                right_start_x = half_width - self.eye_overlap_pixels

                if right_mask.shape != (full_height, right_target_width):
                    right_mask = cv2.resize(right_mask.astype(np.uint8),
                                            (right_target_width, full_height),
                                            interpolation=cv2.INTER_NEAREST)

                combined_mask[:, right_start_x:] = right_mask

            # Store the combined mask under a single object ID for simplicity
            combined_masks[frame_idx] = {1: combined_mask}

        logger.debug(f"Combined {len(combined_masks)} frame masks from left/right eyes")
        return combined_masks
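
    # Usage sketch for combine_eye_masks with tiny synthetic per-eye masks
    # (in practice the masks would come from a segmentation model):
    #
    #   processor = EyeProcessor()
    #   left = {0: np.ones((64, 64), dtype=np.uint8)}    # left eye, frame 0
    #   right = {0: np.zeros((64, 64), dtype=np.uint8)}  # right eye, frame 0
    #   combined = processor.combine_eye_masks(left, right, (64, 128))
    #   mask = combined[0][1]                            # frame 0, object ID 1
    #   assert mask.shape == (64, 128)
    #   assert mask[:, :64].all() and not mask[:, 64:].any()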

    def is_in_left_half(self, detection: Dict[str, Any], frame_width: int) -> bool:
        """
        Check whether a detection lies in the left half of a VR180 frame.

        Args:
            detection: YOLO detection dictionary with 'bbox' key ([x1, y1, x2, y2])
            frame_width: Width of the full VR180 frame

        Returns:
            True if the detection center is in the left half
        """
        bbox = detection['bbox']
        center_x = (bbox[0] + bbox[2]) / 2
        return center_x < (frame_width // 2)

    def is_in_right_half(self, detection: Dict[str, Any], frame_width: int) -> bool:
        """
        Check whether a detection lies in the right half of a VR180 frame.

        Args:
            detection: YOLO detection dictionary with 'bbox' key ([x1, y1, x2, y2])
            frame_width: Width of the full VR180 frame

        Returns:
            True if the detection center is in the right half
        """
        return not self.is_in_left_half(detection, frame_width)
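
    # Usage sketch for routing detections to eyes (hedged: the detection dicts
    # are illustrative; only the 'bbox' key as [x1, y1, x2, y2] is required):
    #
    #   processor = EyeProcessor()
    #   detections = [{'bbox': [100, 200, 300, 400]},
    #                 {'bbox': [2500, 200, 2700, 400]}]
    #   left_dets = [d for d in detections if processor.is_in_left_half(d, 4096)]
    #   right_dets = [d for d in detections if processor.is_in_right_half(d, 4096)]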

    def convert_detection_to_eye_coordinates(self, detection: Dict[str, Any],
                                             eye_side: str, frame_width: int) -> Dict[str, Any]:
        """
        Convert a full-frame detection to eye-specific coordinates.

        Args:
            detection: YOLO detection dictionary with 'bbox' key ([x1, y1, x2, y2])
            eye_side: 'left' or 'right'
            frame_width: Width of the full VR180 frame

        Returns:
            Copy of the detection with coordinates converted for the given eye
        """
        bbox = detection['bbox'].copy()
        half_width = frame_width // 2

        if eye_side == 'right':
            # The right eye crop starts at half_width - eye_overlap_pixels, so
            # shift the x coordinates to the crop's local origin; the left eye
            # crop already starts at x = 0 and needs no shift
            bbox[0] -= (half_width - self.eye_overlap_pixels)  # x1
            bbox[2] -= (half_width - self.eye_overlap_pixels)  # x2

        # Clamp the x coordinates to the eye crop's width
        eye_width = half_width + self.eye_overlap_pixels
        bbox[0] = max(0, min(bbox[0], eye_width - 1))
        bbox[2] = max(0, min(bbox[2], eye_width - 1))

        converted_detection = detection.copy()
        converted_detection['bbox'] = bbox

        return converted_detection
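
    # Usage sketch for convert_detection_to_eye_coordinates (values are
    # illustrative). With frame_width=4096 and no overlap, a right-half box at
    # x = 2500..2700 maps to x = 452..652 in the right eye crop:
    #
    #   processor = EyeProcessor()
    #   det = {'bbox': [2500, 200, 2700, 400]}
    #   eye_det = processor.convert_detection_to_eye_coordinates(det, 'right', 4096)
    #   assert eye_det['bbox'] == [452, 200, 652, 400]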

    def create_full_greenscreen_frame(self, frame_shape: Tuple[int, int, int],
                                      green_color: Tuple[int, int, int] = (0, 255, 0)) -> np.ndarray:
        """
        Create a full greenscreen frame as a fallback when no humans are detected.

        Args:
            frame_shape: Shape of the frame (height, width, channels)
            green_color: BGR color for the green screen; pure green (0, 255, 0)
                is identical in BGR and RGB

        Returns:
            Full greenscreen frame
        """
        greenscreen_frame = np.full(frame_shape, green_color, dtype=np.uint8)
        logger.debug(f"Created full greenscreen frame with shape {frame_shape}")
        return greenscreen_frame
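

if __name__ == "__main__":
    # Minimal self-contained demo (a hedged sketch, not part of the pipeline):
    # split a synthetic VR180 frame, rebuild a full-frame mask from fake
    # per-eye masks, and create a greenscreen fallback frame.
    logging.basicConfig(level=logging.DEBUG)

    processor = EyeProcessor(eye_overlap_pixels=8)
    frame = np.zeros((1024, 2048, 3), dtype=np.uint8)

    left_eye, right_eye = processor.split_frame_into_eyes(frame)
    print(f"left eye: {left_eye.shape}, right eye: {right_eye.shape}")

    fake_left = {0: np.ones(left_eye.shape[:2], dtype=np.uint8)}
    fake_right = {0: np.ones(right_eye.shape[:2], dtype=np.uint8)}
    combined = processor.combine_eye_masks(fake_left, fake_right, frame.shape[:2])
    print(f"combined mask for frame 0: {combined[0][1].shape}")

    greenscreen = processor.create_full_greenscreen_frame(frame.shape)
    print(f"greenscreen frame: {greenscreen.shape}")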