commit 9f572d4430
parent ba8706b7ae
2025-07-26 15:10:34 -07:00

analyze_memory_profile.py (new file, 193 lines)
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Analyze memory profile JSON files to identify OOM causes
"""
import json
import glob
import os
import sys
from pathlib import Path
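

# Each memory_profile_partial_*.json is expected to hold a "timeline" list of samples,
# each with 'timestamp', 'rss_gb', 'vram_gb', and an optional 'checkpoint' label.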
def analyze_memory_files():
    """Analyze partial memory profile files"""
    # Get all partial files in order
    files = sorted(glob.glob('memory_profile_partial_*.json'))
    if not files:
        print("❌ No memory profile files found!")
        print("Expected files like: memory_profile_partial_0.json")
        return

    print(f"🔍 Found {len(files)} memory profile files")
    print("=" * 60)

    peak_memory = 0
    peak_vram = 0
    critical_points = []
    all_checkpoints = []
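
    # Walk the partial files in order, tracking global RSS/VRAM peaks,
    # sudden growth spikes (>3GB between samples), and every named checkpoint.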
    for i, file in enumerate(files):
        try:
            with open(file, 'r') as f:
                data = json.load(f)

            timeline = data.get('timeline', [])
            if not timeline:
                continue

            # Find peaks in this file
            file_peak_rss = max([d['rss_gb'] for d in timeline])
            file_peak_vram = max([d['vram_gb'] for d in timeline])
            if file_peak_rss > peak_memory:
                peak_memory = file_peak_rss
            if file_peak_vram > peak_vram:
                peak_vram = file_peak_vram
            # Find memory growth spikes (>3GB increase)
            for j in range(1, len(timeline)):
                prev_rss = timeline[j-1]['rss_gb']
                curr_rss = timeline[j]['rss_gb']
                growth = curr_rss - prev_rss
                if growth > 3.0:  # >3GB growth spike
                    checkpoint = timeline[j].get('checkpoint', f'sample_{j}')
                    critical_points.append({
                        'file': file,
                        'file_index': i,
                        'sample': j,
                        'timestamp': timeline[j]['timestamp'],
                        'rss_gb': curr_rss,
                        'vram_gb': timeline[j]['vram_gb'],
                        'growth_gb': growth,
                        'checkpoint': checkpoint
                    })
            # Collect all checkpoints
            checkpoints = [d for d in timeline if 'checkpoint' in d]
            for cp in checkpoints:
                cp['file'] = file
                cp['file_index'] = i
                all_checkpoints.append(cp)
            # Show progress for this file
            if timeline:
                start_rss = timeline[0]['rss_gb']
                end_rss = timeline[-1]['rss_gb']
                growth = end_rss - start_rss
                samples = len(timeline)
                print(f"📊 File {i+1:2d}: {start_rss:5.1f}GB → {end_rss:5.1f}GB "
                      f"(+{growth:4.1f}GB) [{samples:3d} samples]")

                # Show significant checkpoints from this file
                if checkpoints:
                    for cp in checkpoints:
                        print(f"   📍 {cp['checkpoint']}: {cp['rss_gb']:.1f}GB")

        except Exception as e:
            print(f"❌ Error reading {file}: {e}")
print("\n" + "=" * 60)
print("🎯 ANALYSIS SUMMARY")
print("=" * 60)
print(f"📈 Peak Memory: {peak_memory:.1f} GB")
print(f"🎮 Peak VRAM: {peak_vram:.1f} GB")
print(f"⚡ Growth Spikes: {len(critical_points)} events >3GB")
if critical_points:
print(f"\n💥 MEMORY GROWTH SPIKES (>3GB):")
print(" Location Growth Total VRAM")
print(" " + "-" * 55)
for point in critical_points:
location = point['checkpoint'][:30].ljust(30)
print(f" {location} +{point['growth_gb']:4.1f}GB → {point['rss_gb']:5.1f}GB {point['vram_gb']:4.1f}GB")
    if all_checkpoints:
        print(f"\n📍 CHECKPOINT PROGRESSION:")
        print("   Checkpoint                     Memory    VRAM   File")
        print("   " + "-" * 55)
        for cp in all_checkpoints:
            checkpoint = cp['checkpoint'][:30].ljust(30)
            file_num = cp['file_index'] + 1
            print(f"   {checkpoint} {cp['rss_gb']:5.1f}GB  {cp['vram_gb']:4.1f}GB  #{file_num}")
    # Memory growth analysis
    if len(all_checkpoints) > 1:
        print(f"\n📊 MEMORY GROWTH ANALYSIS:")
        # Find the biggest memory jumps between checkpoints
        big_jumps = []
        for i in range(1, len(all_checkpoints)):
            prev_cp = all_checkpoints[i-1]
            curr_cp = all_checkpoints[i]
            growth = curr_cp['rss_gb'] - prev_cp['rss_gb']
            if growth > 2.0:  # >2GB jump
                big_jumps.append({
                    'from': prev_cp['checkpoint'],
                    'to': curr_cp['checkpoint'],
                    'growth': growth,
                    'from_memory': prev_cp['rss_gb'],
                    'to_memory': curr_cp['rss_gb']
                })

        if big_jumps:
            print("   Major jumps (>2GB):")
            for jump in big_jumps:
                print(f"   {jump['from']} → {jump['to']}: "
                      f"+{jump['growth']:.1f}GB ({jump['from_memory']:.1f} → {jump['to_memory']:.1f}GB)")
        else:
            print("   ✅ No major memory jumps detected")
    # Diagnosis
    print(f"\n🔬 DIAGNOSIS:")
    if peak_memory > 400:
        print("   🔴 CRITICAL: Memory usage exceeded 400GB")
        print("   💡 Recommendation: Reduce chunk_size to 200-300 frames")
    elif peak_memory > 200:
        print("   🟡 HIGH: Memory usage over 200GB")
        print("   💡 Recommendation: Reduce chunk_size to 400 frames")
    else:
        print("   🟢 MODERATE: Memory usage under 200GB")

    if critical_points:
        # Find most common growth spike locations
        spike_locations = {}
        for point in critical_points:
            location = point['checkpoint']
            spike_locations[location] = spike_locations.get(location, 0) + 1

        print("\n   🎯 Most problematic locations:")
        for location, count in sorted(spike_locations.items(), key=lambda x: x[1], reverse=True)[:3]:
            print(f"      {location}: {count} spikes")
print(f"\n💡 NEXT STEPS:")
if 'merge' in str(critical_points).lower():
print(" 1. Chunk merging still causing memory accumulation")
print(" 2. Check if streaming merge is actually being used")
print(" 3. Verify chunk files are being deleted immediately")
elif 'propagation' in str(critical_points).lower():
print(" 1. SAM2 propagation using too much memory")
print(" 2. Reduce chunk_size further (try 300 frames)")
print(" 3. Enable more aggressive frame release")
else:
print(" 1. Review the checkpoint progression above")
print(" 2. Focus on locations with biggest memory spikes")
print(" 3. Consider reducing chunk_size if spikes are large")


def main():
    print("🔍 MEMORY PROFILE ANALYZER")
    print("Analyzing memory profile files for OOM causes...")
    print()
    analyze_memory_files()


if __name__ == "__main__":
    main()