analyze

analyze_memory_profile.py (new file, 193 lines added)

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Analyze memory profile JSON files to identify OOM causes
"""
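
# Expected input layout (inferred from the fields this script reads; the exact
# schema written by the profiler may differ): each memory_profile_partial_*.json
# holds a "timeline" list of samples, e.g.
#   {"timeline": [{"timestamp": 12.3, "rss_gb": 41.7, "vram_gb": 18.2,
#                  "checkpoint": "start_propagation"},   # "checkpoint" optional
#                 ...]}
# where rss_gb is process resident memory and vram_gb is GPU memory, in GB.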

import json
import glob


def analyze_memory_files():
    """Analyze partial memory profile files"""

    # Get all partial files in order
    files = sorted(glob.glob('memory_profile_partial_*.json'))

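    # Note: the glob above is relative to the current working directory, so run
    # this script from the directory that holds the partial profile files.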
    if not files:
        print("❌ No memory profile files found!")
        print("Expected files like: memory_profile_partial_0.json")
        return

    print(f"🔍 Found {len(files)} memory profile files")
    print("=" * 60)

    peak_memory = 0
    peak_vram = 0
    critical_points = []
    all_checkpoints = []

    for i, file in enumerate(files):
        try:
            with open(file, 'r') as f:
                data = json.load(f)

            timeline = data.get('timeline', [])
            if not timeline:
                continue

            # Find peaks in this file
            file_peak_rss = max(d['rss_gb'] for d in timeline)
            file_peak_vram = max(d['vram_gb'] for d in timeline)

            if file_peak_rss > peak_memory:
                peak_memory = file_peak_rss
            if file_peak_vram > peak_vram:
                peak_vram = file_peak_vram

            # Find memory growth spikes (>3GB increase)
            for j in range(1, len(timeline)):
                prev_rss = timeline[j - 1]['rss_gb']
                curr_rss = timeline[j]['rss_gb']
                growth = curr_rss - prev_rss

                if growth > 3.0:  # >3GB growth spike
                    checkpoint = timeline[j].get('checkpoint', f'sample_{j}')
                    critical_points.append({
                        'file': file,
                        'file_index': i,
                        'sample': j,
                        'timestamp': timeline[j]['timestamp'],
                        'rss_gb': curr_rss,
                        'vram_gb': timeline[j]['vram_gb'],
                        'growth_gb': growth,
                        'checkpoint': checkpoint
                    })

            # Collect all checkpoints
            checkpoints = [d for d in timeline if 'checkpoint' in d]
            for cp in checkpoints:
                cp['file'] = file
                cp['file_index'] = i
                all_checkpoints.append(cp)

            # Show progress for this file
            start_rss = timeline[0]['rss_gb']
            end_rss = timeline[-1]['rss_gb']
            growth = end_rss - start_rss
            samples = len(timeline)

            print(f"📊 File {i+1:2d}: {start_rss:5.1f}GB → {end_rss:5.1f}GB "
                  f"(+{growth:4.1f}GB) [{samples:3d} samples]")

            # Show significant checkpoints from this file
            for cp in checkpoints:
                print(f"   📍 {cp['checkpoint']}: {cp['rss_gb']:.1f}GB")

        except Exception as e:
            print(f"❌ Error reading {file}: {e}")

print("\n" + "=" * 60)
|
||||
print("🎯 ANALYSIS SUMMARY")
|
||||
print("=" * 60)
|
||||
|
||||
print(f"📈 Peak Memory: {peak_memory:.1f} GB")
|
||||
print(f"🎮 Peak VRAM: {peak_vram:.1f} GB")
|
||||
print(f"⚡ Growth Spikes: {len(critical_points)} events >3GB")
|
||||
|
||||
if critical_points:
|
||||
print(f"\n💥 MEMORY GROWTH SPIKES (>3GB):")
|
||||
print(" Location Growth Total VRAM")
|
||||
print(" " + "-" * 55)
|
||||
|
||||
for point in critical_points:
|
||||
location = point['checkpoint'][:30].ljust(30)
|
||||
print(f" {location} +{point['growth_gb']:4.1f}GB → {point['rss_gb']:5.1f}GB {point['vram_gb']:4.1f}GB")
|
||||
|
||||
if all_checkpoints:
|
||||
print(f"\n📍 CHECKPOINT PROGRESSION:")
|
||||
print(" Checkpoint Memory VRAM File")
|
||||
print(" " + "-" * 55)
|
||||
|
||||
for cp in all_checkpoints:
|
||||
checkpoint = cp['checkpoint'][:30].ljust(30)
|
||||
file_num = cp['file_index'] + 1
|
||||
print(f" {checkpoint} {cp['rss_gb']:5.1f}GB {cp['vram_gb']:4.1f}GB #{file_num}")
|
||||
|
||||
    # Memory growth analysis
    if len(all_checkpoints) > 1:
        print("\n📊 MEMORY GROWTH ANALYSIS:")

        # Find the biggest memory jumps between checkpoints
        big_jumps = []
        for i in range(1, len(all_checkpoints)):
            prev_cp = all_checkpoints[i - 1]
            curr_cp = all_checkpoints[i]

            growth = curr_cp['rss_gb'] - prev_cp['rss_gb']
            if growth > 2.0:  # >2GB jump
                big_jumps.append({
                    'from': prev_cp['checkpoint'],
                    'to': curr_cp['checkpoint'],
                    'growth': growth,
                    'from_memory': prev_cp['rss_gb'],
                    'to_memory': curr_cp['rss_gb']
                })

        if big_jumps:
            print("   Major jumps (>2GB):")
            for jump in big_jumps:
                print(f"   {jump['from']} → {jump['to']}: "
                      f"+{jump['growth']:.1f}GB ({jump['from_memory']:.1f}→{jump['to_memory']:.1f}GB)")
        else:
            print("   ✅ No major memory jumps detected")

    # Diagnosis
    print("\n🔬 DIAGNOSIS:")

    if peak_memory > 400:
        print("   🔴 CRITICAL: Memory usage exceeded 400GB")
        print("   💡 Recommendation: Reduce chunk_size to 200-300 frames")
    elif peak_memory > 200:
        print("   🟡 HIGH: Memory usage over 200GB")
        print("   💡 Recommendation: Reduce chunk_size to 400 frames")
    else:
        print("   🟢 MODERATE: Memory usage under 200GB")

    if critical_points:
        # Find most common growth spike locations
        spike_locations = {}
        for point in critical_points:
            location = point['checkpoint']
            spike_locations[location] = spike_locations.get(location, 0) + 1

        print("\n   🎯 Most problematic locations:")
        for location, count in sorted(spike_locations.items(), key=lambda x: x[1], reverse=True)[:3]:
            print(f"      {location}: {count} spikes")

    print("\n💡 NEXT STEPS:")
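    # Crude keyword heuristic: look for "merge" / "propagation" in the
    # serialized spike records to guess which stage drove the memory growth.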
    if 'merge' in str(critical_points).lower():
        print("   1. Chunk merging still causing memory accumulation")
        print("   2. Check if streaming merge is actually being used")
        print("   3. Verify chunk files are being deleted immediately")
    elif 'propagation' in str(critical_points).lower():
        print("   1. SAM2 propagation using too much memory")
        print("   2. Reduce chunk_size further (try 300 frames)")
        print("   3. Enable more aggressive frame release")
    else:
        print("   1. Review the checkpoint progression above")
        print("   2. Focus on locations with biggest memory spikes")
        print("   3. Consider reducing chunk_size if spikes are large")


def main():
    print("🔍 MEMORY PROFILE ANALYZER")
    print("Analyzing memory profile files for OOM causes...")
    print()

    analyze_memory_files()


if __name__ == "__main__":
    main()